diff --git a/INSTALL.md b/INSTALL.md
index 709b8017..84549e43 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -4,7 +4,7 @@ Install
  - C++14 compiler (clang, gcc, hipcc, icc, nvcc)
   - *Debian/Ubuntu:* `sudo apt-get install gcc build-essential`
   - *Arch Linux:* `sudo pacman -S base-devel`
- - `alpaka` 1.1.0
+ - `alpaka` 1.2.0
   - included as git submodule
  - `boost` >= 1.65.1
    - dependency of alpaka
diff --git a/alpaka/.github/workflows/ci.yml b/alpaka/.github/workflows/ci.yml
index 709d1c5f..fe14a5a3 100644
--- a/alpaka/.github/workflows/ci.yml
+++ b/alpaka/.github/workflows/ci.yml
@@ -14,16 +14,15 @@ concurrency:
 ################################################################################
 # NOTE: Testing the full matrix is not practical.
 # Therefore we aim to have each value been set in at lest one job.
-# CXX                                           : {g++, clang++}
+# ALPAKA_CI_CXX                                 : {g++, clang++}
 #   [g++] ALPAKA_CI_GCC_VER                     : {9, 10, 11, 12, 13}
 #   [clang++] ALPAKA_CI_CLANG_VER               : {9, 10, 11, 12, 13, 14}
 #   [cl.exe] ALPAKA_CI_CL_VER                   : {2022}
-#   ALPAKA_CI_STDLIB                            : {libstdc++, [CXX==clang++]:libc++}
+#   ALPAKA_CI_STDLIB                            : {libstdc++, [ALPAKA_CI_CXX==clang++]:libc++}
 # CMAKE_BUILD_TYPE                              : {Debug, Release}
 # alpaka_CI                                     : {GITHUB}
-# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME              : {ubuntu:20.04, ubuntu:22.04}
-# ALPAKA_BOOST_VERSION                          : {1.74.0, 1.75.0, 1.76.0, 1.77.0, 1.78.0, 1.79.0, 1.80.0, 1.81.0, 1.82.0}
-# ALPAKA_CI_CMAKE_VER                           : {3.22.6, 3.23.5, 3.24.4, 3.25.3, 3.26.4}
+# ALPAKA_BOOST_VERSION                          : {1.74.0, 1.75.0, 1.76.0, 1.77.0, 1.78.0, 1.79.0, 1.80.0, 1.81.0, 1.82.0, 1.83.0, 1.84.0, 1.85.0, 1.86.0}
+# ALPAKA_CI_CMAKE_VER                           : {3.22.6, 3.23.5, 3.24.4, 3.25.3, 3.26.4, 3.27.9, 3.28.6, 3.29.8, 3.30.3}
 # ALPAKA_CI_XCODE_VER                           : {13.2.1, 14.2}
 # ALPAKA_CI_SANITIZERS                          : {ASan, UBsan, TSan}
 #    TSan is not currently used because it produces many unexpected errors
@@ -40,8 +39,8 @@ concurrency:
 # alpaka_ACC_ANY_BT_OMP5_ENABLE                 : {ON, OFF}
 #   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
 # alpaka_ACC_GPU_CUDA_ENABLE                    : {ON, OFF}
-#   [ON] ALPAKA_CI_CUDA_VERSION                 : {11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6}
-#   [ON] CMAKE_CUDA_COMPILER                    : {nvcc, [CXX==clang++]:clang++}
+#   [ON] ALPAKA_CI_CUDA_VERSION                 : {11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6}
+#   [ON] ALPAKA_CI_CUDA_COMPILER                : {nvcc, [ALPAKA_CI_CXX==clang++]:clang++}
 # alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE             : {ON, OFF}
 # alpaka_ACC_GPU_HIP_ENABLE                     : {ON, OFF}
 #   [ON] ALPAKA_CI_HIP_BRANCH                   : {rocm-4.2}
@@ -59,9 +58,10 @@ env:
   ALPAKA_CI_HIP_ROOT_DIR: ${{ github.workspace }}/hip
   ALPAKA_CI_SANITIZERS: ""
   ALPAKA_CI_ANALYSIS: OFF
-  ALPAKA_CI_ONEAPI_VERSION: 2023.2.0
+  ALPAKA_CI_ONEAPI_VERSION: 2024.0
   ALPAKA_CI_TBB_VERSION: 2021.10.0
   ALPAKA_CI_RUN_TESTS: ON
+  alpaka_CXX_STANDARD: 17
   alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: ON
   alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: ON
   alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: ON
@@ -99,6 +99,7 @@ jobs:
     name: ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     env: ${{ matrix.env }}
+    container: ${{ matrix.container }}
 
     strategy:
       fail-fast: false
@@ -107,41 +108,43 @@ jobs:
         ### Analysis builds
         - name: linux_clang-14_cuda-11.2_debug_analysis
           os: ubuntu-20.04
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 14,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_CI_RUN_TESTS: OFF, alpaka_DEBUG: 1, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "11.2", CMAKE_CUDA_COMPILER: clang++,   alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+          env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 14,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_ANALYSIS: ON, ALPAKA_CI_RUN_TESTS: OFF, alpaka_DEBUG: 1, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "11.2", ALPAKA_CI_CUDA_COMPILER : clang++,   alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+          container: ubuntu:20.04
         - name: windows_cl-2022_debug_analysis
           os: windows-2022
-          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.23.5,                     ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+          env: {ALPAKA_CI_CXX: cl.exe,  ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.23.5,                     ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
         - name: macos_xcode-14.2_debug_analysis
           os: macos-12
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 14.2,                                CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.81.0,                                                  ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2,                                                                                                                                                                             alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+          env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.2,                                CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.81.0,                                                  ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2,                                                                                                                                                                             alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
         - name: linux_gcc-12_debug_analysis
           os: ubuntu-22.04
-          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 12,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+          env: {ALPAKA_CI_CXX: g++,    ALPAKA_CI_GCC_VER: 12,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+          container: ubuntu:22.04
 
         ### macOS
         - name: macos_xcode-14.2_release
           os: macos-12
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 14.2,                                CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0,                                                                                                                                                                                                                                                                       alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+          env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.2,                                CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0,                                                                                                                                                                                                                                                                       alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
         - name: macos_xcode-14.3.1_debug
           os: macos-13
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 14.3.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.82.0,                                                                                                                                                                                                                                                                       alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+          env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.3.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.82.0,                                                                                                                                                                                                                                                                       alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
 
         ### Windows
         - name: windows_cl-2022_release
           os: windows-2022
-          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 1}
+          env: {ALPAKA_CI_CXX: cl.exe,  ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 1}
         - name: windows_cl-2022_debug
           os: windows-2022
-          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.25.1, OMP_NUM_THREADS: 4,                                                                                                                                                                                           alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+          env: {ALPAKA_CI_CXX: cl.exe,  ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.25.1, OMP_NUM_THREADS: 4,                                                                                                                                                                                           alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
 
         ## CUDA 12.1
         # nvcc + MSVC
         # - name: windows_nvcc-12.1_cl-2022_release_cuda-only
         #  os: windows-2022
-        #  env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_CMAKE_VER: 3.24.4, ALPAKA_CI_RUN_TESTS: OFF,                     alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", CMAKE_CUDA_ARCHITECTURES: "50;90", alpaka_ACC_GPU_CUDA_ONLY_MODE: ON,                                                     alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        #  env: {ALPAKA_CI_CXX: cl.exe,  ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_CMAKE_VER: 3.24.4, ALPAKA_CI_RUN_TESTS: OFF,                     alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", CMAKE_CUDA_ARCHITECTURES: "50;90", alpaka_ACC_GPU_CUDA_ONLY_MODE: ON,                                                     alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
         # - name: windows_nvcc-12.1_cl-2022_debug
         #  os: windows-2022
-        #  env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.25.1, ALPAKA_CI_RUN_TESTS: OFF,                     alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1",                                                                                                                           alpaka_ACC_CPU_BT_OMP5_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        #  env: {ALPAKA_CI_CXX: cl.exe,  ALPAKA_CI_CL_VER: 2022,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.25.1, ALPAKA_CI_RUN_TESTS: OFF,                     alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1",                                                                                                                           alpaka_ACC_CPU_BT_OMP5_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
 
         ### Ubuntu
         ## native
@@ -153,13 +156,16 @@ jobs:
         #  - Ubuntu 22.04
         - name: linux_gcc-9_debug
           os: ubuntu-20.04
-          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 9,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.74.0, ALPAKA_CI_CMAKE_VER: 3.22.6, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_EXTENSIONS: OFF}
+          env: {ALPAKA_CI_CXX: g++,    ALPAKA_CI_GCC_VER: 9,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.74.0, ALPAKA_CI_CMAKE_VER: 3.22.6, OMP_NUM_THREADS: 4, CMAKE_CXX_EXTENSIONS: OFF}
+          container: ubuntu:20.04
         - name: linux_gcc-12_release_c++20
           os: ubuntu-22.04
-          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 12,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", alpaka_CXX_STANDARD: 20, alpaka_USE_MDSPAN: "FETCH"}
+          env: {ALPAKA_CI_CXX: g++,    ALPAKA_CI_GCC_VER: 12,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 2, alpaka_CXX_STANDARD: 20, alpaka_USE_MDSPAN: "FETCH"}
+          container: ubuntu:22.04
         - name: linux_gcc-13_debug
           os: ubuntu-22.04
-          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 13,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.82.0, ALPAKA_CI_CMAKE_VER: 3.26.4, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04"}
+          env: {ALPAKA_CI_CXX: g++,    ALPAKA_CI_GCC_VER: 13,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.82.0, ALPAKA_CI_CMAKE_VER: 3.26.4, OMP_NUM_THREADS: 2}
+          container: ubuntu:22.04
 
         # TODO: keep jobs until GitLab CI supports:
         #  - disable CMAKE_CXX_EXTENSIONS=OFF
@@ -169,20 +175,25 @@ jobs:
         # clang++
         - name: linux_clang-10_release
           os: ubuntu-20.04
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.75.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+          env: {ALPAKA_CI_CXX: clang++,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.75.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+          container: ubuntu:20.04
         # clang-11 tested in GitLab CI
         - name: linux_clang-12_release
           os: ubuntu-20.04
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 12,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+          env: {ALPAKA_CI_CXX: clang++,  ALPAKA_CI_CLANG_VER: 12,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+          container: ubuntu:20.04
         - name: linux_clang-13_debug
           os: ubuntu-22.04
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 13,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_EXTENSIONS: OFF}
+          env: {ALPAKA_CI_CXX: clang++,  ALPAKA_CI_CLANG_VER: 13,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 3, CMAKE_CXX_EXTENSIONS: OFF}
+          container: ubuntu:22.04
         - name: linux_clang-16_debug_ubsan
-          os: ubuntu-latest
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 16,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: UBSan}
+          os: ubuntu-22.04
+          env: {ALPAKA_CI_CXX: clang++,  ALPAKA_CI_CLANG_VER: 16,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: UBSan}
+          container: ubuntu:22.04
         - name: linux_clang-16_debug_tsan
-          os: ubuntu-latest
-          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 16,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: TSan}
+          os: ubuntu-22.04
+          env: {ALPAKA_CI_CXX: clang++,  ALPAKA_CI_CLANG_VER: 16,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: TSan}
+          container: ubuntu:22.04
 
     steps:
     - name: check filter
diff --git a/alpaka/.github/workflows/gh-pages.yml b/alpaka/.github/workflows/gh-pages.yml
index 256ee452..993f85f6 100644
--- a/alpaka/.github/workflows/gh-pages.yml
+++ b/alpaka/.github/workflows/gh-pages.yml
@@ -6,6 +6,8 @@ on:
   push:
     branches:
       - develop
+env:
+  ALPAKA_CI_OS_NAME: "Linux"
 
 jobs:
   gh-pages:
diff --git a/alpaka/.gitignore b/alpaka/.gitignore
index 6c946c89..2dff37de 100644
--- a/alpaka/.gitignore
+++ b/alpaka/.gitignore
@@ -8,6 +8,7 @@
 
 # tmp files
 *~
+.*.swp
 
 # netbeans project files
 /nbproject/
diff --git a/alpaka/.gitlab-ci.yml b/alpaka/.gitlab-ci.yml
index 549f8ed9..a649e23e 100644
--- a/alpaka/.gitlab-ci.yml
+++ b/alpaka/.gitlab-ci.yml
@@ -17,7 +17,7 @@ variables:
   # container version of the generated jobs
   # should be merged with ALPAKA_GITLAB_CI_CONTAINER_VERSION
   # see: script/job_generator/generate_job_yaml.py
-  ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.1"
+  ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.2"
 
 generate:
   stage: generator
@@ -27,7 +27,9 @@ generate:
     - apk update && apk add python3~=3.11 py3-pip
     - pip3 install -r script/job_generator/requirements.txt
     # it is sufficient to verify once, as the same job matrix is generated, verified and then filtered each time
-    - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
+    # disable verify because we know that the generator is broken: https://github.com/thombashi/allpairspy/pull/10
+    #- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
+    - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave compile_only_job -o compile_only.yml
     - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_cpu -o runtime_cpu.yml
     - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_gpu -o runtime_gpu.yml
     - cat compile_only.yml
diff --git a/alpaka/.pre-commit-config.yaml b/alpaka/.pre-commit-config.yaml
new file mode 100644
index 00000000..065efe81
--- /dev/null
+++ b/alpaka/.pre-commit-config.yaml
@@ -0,0 +1,45 @@
+minimum_pre_commit_version: 3.2.0  # necessitated by Lucas-C's hooks
+default_install_hook_types: [pre-commit, pre-push]
+exclude: 'thirdParty'
+repos:
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v16.0.6
+  hooks:
+  - id: clang-format
+    files: \.(cpp|hpp)
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.5.0
+  hooks:
+  - id: no-commit-to-branch
+    args: [-b, develop]
+  - id: check-merge-conflict
+  - id: trailing-whitespace
+    exclude_types: [markdown, rst]
+  - id: end-of-file-fixer
+  - id: check-toml
+  - id: check-yaml
+  - id: mixed-line-ending
+  - id: check-executables-have-shebangs
+  - id: check-shebang-scripts-are-executable
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+  rev: v1.5.4
+  hooks:
+    - id: forbid-tabs
+      types_or: [file]
+      exclude_types: [rst]
+    - id: remove-tabs
+      types_or: [file]
+      exclude_types: [rst]
+    - id: forbid-crlf
+    - id: remove-crlf
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.2.1
+  hooks:
+    - id: ruff
+      types_or: [ python, pyi, jupyter ]
+      # The ignores in here are chosen to conform with the currently
+      # existing code and not motivated any other way.
+      args: [ --fix, --ignore, "F403,F405,E731"]
+    - id: ruff-format
+      types_or: [ python, pyi, jupyter ]
+      args: ["--line-length", "120"]
diff --git a/alpaka/.zenodo.json b/alpaka/.zenodo.json
index a2c1e618..9b11be3f 100644
--- a/alpaka/.zenodo.json
+++ b/alpaka/.zenodo.json
@@ -12,29 +12,46 @@
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
       "orcid": "0000-0002-8218-3116"
     },
+    {
+      "name": "Erdem, Sven",
+      "affiliation": "Helmholtz-Zentrum Berlin"
+    },
+    {
+      "name": "Fila, Mateusz Jakub",
+      "affiliation": "CERN"
+    },
     {
       "name": "Gruber, Bernhard Manfred",
       "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf, CERN",
       "orcid": "0000-0001-7848-1690"
     },
     {
-      "name": "Martin-Haugh, Stewart",
-      "affiliation": "STFC Rutherford Appleton Laboratory",
-      "orcid": "0000-0001-9457-1928"
+      "name": "Lenz, Julian",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0001-5250-0005"
     },
     {
       "name": "Perego, Aurora",
       "affiliation": "CERN",
-      "orcid": "0000-0003-1576-6757"
+      "orcid": "0009-0002-5210-6213"
     },
     {
-      "name": "Tascon, Andres Rios",
-      "affiliation": "Princeton University"
+      "name": "Varvarin, Michael",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "name": "Vyskočil, Jiří",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0001-8822-0929"
     },
     {
       "name": "Widera, René",
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
       "orcid": "0000-0003-1642-0459"
+    },
+    {
+      "name": "Yusufoglu, Mehmet",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf"
     }
   ],
   "contributors": [
@@ -98,6 +115,12 @@
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
       "type": "Other"
     },
+    {
+      "name": "Martin-Haugh, Stewart",
+      "affiliation": "STFC Rutherford Appleton Laboratory",
+      "orcid": "0000-0001-9457-1928",
+      "type": "Other"
+    },
     {
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
       "name": "Matthes, Alexander",
@@ -143,14 +166,13 @@
       "type": "Other"
     },
     {
-      "name": "Vollmer, Daniel",
-      "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V.",
+      "name": "Tascon, Andres Rios",
+      "affiliation": "Princeton University",
       "type": "Other"
     },
     {
-      "name": "Vyskočil, Jiří",
-      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
-      "orcid": "0000-0001-8822-0929",
+      "name": "Vollmer, Daniel",
+      "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V.",
       "type": "Other"
     },
     {
diff --git a/alpaka/CHANGELOG.md b/alpaka/CHANGELOG.md
index dab9f20e..6af19ec2 100644
--- a/alpaka/CHANGELOG.md
+++ b/alpaka/CHANGELOG.md
@@ -3,6 +3,65 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## [1.2.0] - 2024-10-02
+
+### Added
+
+- device functions to simplify writing kernel code #2337 #2369 #2383
+- support Clang 18 and 19 #2387
+- support oneAPI 2024.2 #2368
+- support for mapped memory allocation for the SYCL backend #2375
+- support for pre-commit #2253
+- support for device and constant global variables in the SYCL backend #2242
+- alpaka::meta::isList, alpaka::meta::ToList and alpaka::meta::toTuple #2269
+- accelerator trait to check for single and multi-threads per block #2263
+- trait IsKernelTriviallyCopyable #2302
+- trait AccIsEnabled #2267
+- documentation: cmake flag to build alpaka benchmarks #2272
+- benchmark: babelstream support for different Accs #2299
+- example: using MdSpan to pass 2D data #2293
+- example: 2D heat equation #2365 #2383
+- example: Convolution #2228 #2220
+
+### Changed
+
+- update cheatsheet.rst  #2398 #2386 #2241
+- signature of `[get|is]ValidWorkDiv*` #2349
+- use shared CUDA libraries by default #2348 #2342
+- add thread count to CPU blocks accelerators #2338
+- link libcudart even when libcurand is not used #2329
+- ctest: display only output of tests, which failed #2322
+- example: Matrix Multiplication use MdSpan #2317
+- move the Complex class to internal namespace #2301
+- run examples with all enabled accelerators #2280
+- template order allocMappedBuf #2270
+- slow getWarpSize problem #2246
+- simplification of workdiv creation #2240
+- benchmarks: move from examples into own directory #2237
+
+### Fixed
+
+- `[get|is]ValidWorkDiv*` #2349 #2335
+- cray clang compiler errors #2392
+- fix and update SYCL targets #2390 #2361
+- single thread acc throw for invalid workdiv fix #2391
+- explicitly call alpaka::detail to achieve SYCL compilation #2385
+- deduction guide for vector #2376
+- issue with device global variables with CUDA 12.4 #2303
+- clang9/nvcc11.2 boost bug #2294
+- HIP: fix CMake relocatable device code option #2290
+- Re-enable AtomicAtomicRef #2288
+- alpaka_add_library relocatable device code #2273
+- forwarding of msvc compiler flag '/Zo' #2266
+- Windows: usage of Idx to alpaka::Idx #2265
+- compiler detection for clang 17 and 18 as CUDA compiler with libstdc++ (gcc) #2256
+- support for non-integral types in Vec generator constructor #2236
+- memcpy warning #2295
+
+### Removed
+
+- support for nvcc11.0 and nvcc11.1 #2310
+
 ## [1.1.0] - 2024-01-18
 
 ### Added
diff --git a/alpaka/CMakeLists.txt b/alpaka/CMakeLists.txt
index 860796f5..fcaae972 100644
--- a/alpaka/CMakeLists.txt
+++ b/alpaka/CMakeLists.txt
@@ -35,6 +35,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 # Options and Variants
 
 option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)
+option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF)
 
 # Enable the test infrastructure only if alpaka is the top-level project
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
@@ -46,8 +47,9 @@ endif()
 option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)
 
 include(CMakeDependentOption)
+
 cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF)
-cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF)
+cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARKS" OFF)
 
 ################################################################################
 # Internal variables.
@@ -80,7 +82,7 @@ set(_alpaka_INCLUDE_DIRECTORY "${_alpaka_ROOT_DIR}/include")
 set(_alpaka_SUFFIXED_INCLUDE_DIR "${_alpaka_INCLUDE_DIRECTORY}/alpaka")
 
 # the sequential accelerator is required for the tests and examples
-if(alpaka_BUILD_EXAMPLES OR BUILD_TESTING)
+if(alpaka_BUILD_EXAMPLES OR alpaka_BUILD_BENCHMARKS OR BUILD_TESTING)
   if (NOT (alpaka_ACC_GPU_CUDA_ONLY_MODE OR alpaka_ACC_GPU_HIP_ONLY_MODE))
     if (NOT DEFINED alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE)
       option(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE "enable alpaka serial accelerator" ON)
@@ -148,6 +150,10 @@ if(alpaka_BUILD_EXAMPLES)
     add_subdirectory("example/")
 endif()
 
+if(alpaka_BUILD_BENCHMARKS)
+    add_subdirectory("benchmarks/")
+endif()
+
 # Only build the tests if alpaka is the top-level project and BUILD_TESTING is ON
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING)
     add_subdirectory("test/")
@@ -184,10 +190,10 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
 
     if(alpaka_INSTALL_TEST_HEADER)
         install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
-	        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
     else()
-	install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
-	        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
+            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
                 PATTERN "test" EXCLUDE)
     endif()
 
diff --git a/alpaka/CONTRIBUTING.md b/alpaka/CONTRIBUTING.md
index bae65b41..eec1037c 100644
--- a/alpaka/CONTRIBUTING.md
+++ b/alpaka/CONTRIBUTING.md
@@ -1,11 +1,34 @@
 # Contributing
 
+Please review our more detailed [Coding Guidelines](https://alpaka.readthedocs.io/en/latest/dev/style.html) as well.
+
+## Pre-commit
+
+This project is set up for use with [pre-commit](https://pre-commit.com). Using it will make your code conform with most
+of our (easily automatable) code style guidelines automatically.
+In very short (for anything further see [pre-commit](https://pre-commit.com)), after running the following in your
+working clone of alpaka
+```bash
+# if not yet done, install the pre-commit executable following https://pre-commit.com
+cd /path/to/alpaka-working-clone
+pre-commit install
+```
+`git` will run a number of checks prior to every commit and push and will refuse to perform the
+pertinent action if they fail. Most of them (like e.g. the formatter) will have automatically altered your working tree
+with the necessary changes such that
+```bash
+git add -u
+```
+will make the next commit pass.
+
 ## Formatting
 
-Please format your code before before opening pull requests using clang-format 16 and the .clang-format file placed in the repository root.
+Please format your code before before opening pull requests using clang-format 16 and the .clang-format file placed in 
+the repository root. If you were using `pre-commit` during your changes, this has happened automatically already. If
+not, find further instructions below.
 
 ### Visual Studio and CLion
-Suport for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
+Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
 The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
 
 ### Bash
diff --git a/alpaka/README.md b/alpaka/README.md
index 945e6fba..f7a03994 100644
--- a/alpaka/README.md
+++ b/alpaka/README.md
@@ -15,18 +15,18 @@ The **alpaka** library is a header-only C++17 abstraction library for accelerato
 
 Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.
 
-It is platform independent and supports the concurrent and cooperative use of multiple devices such as the hosts CPU as well as attached accelerators as for instance CUDA GPUs and Xeon Phis (currently native execution only).
-A multitude of accelerator back-end variants using CUDA, OpenMP (2.0/5.0), std::thread and also serial execution is provided and can be selected depending on the device.
+It is platform independent and supports the concurrent and cooperative use of multiple devices such as the hosts CPU (x86, ARM, RISC-V and Power 8+) and  GPU accelerators from different vendors (NVIDIA, AMD and Intel).
+A multitude of accelerator back-end variants using NVIDIA CUDA, AMD HIP, SYCL, OpenMP 2.0+, std::thread and also serial execution is provided and can be selected depending on the device.
 Only one implementation of the user kernel is required by representing them as function objects with a special interface.
-There is no need to write special CUDA, OpenMP or custom threading code.
-Accelerator back-ends can be mixed within a device queue.
+There is no need to write special CUDA, HIP, OpenMP or custom threading code.
+Accelerator back-ends can be mixed and synchronized via compute device queue.
 The decision which accelerator back-end executes which kernel can be made at runtime.
 
-The abstraction used is very similar to the CUDA grid-blocks-threads division strategy.
+The abstraction used is very similar to the CUDA grid-blocks-threads domain decomposition strategy.
 Algorithms that should be parallelized have to be divided into a multi-dimensional grid consisting of small uniform work items.
 These functions are called kernels and are executed in parallel threads.
 The threads in the grid are organized in blocks.
-All threads in a block are executed in parallel and can interact via fast shared memory.
+All threads in a block are executed in parallel and can interact via fast shared memory and low level synchronization methods.
 Blocks are executed independently and can not interact in any way.
 The block execution order is unspecified and depends on the accelerator in use.
 By using this abstraction the execution can be optimally adapted to the available hardware.
@@ -65,17 +65,17 @@ Supported Compilers
 
 This library uses C++17 (or newer when available).
 
-| Accelerator Back-end                                                           | gcc 9.5 <br/> (Linux)                           | gcc 10.4 / 11.1 <br/> (Linux)                   | gcc 12.3 <br/> (Linux)                      | gcc 13.1 <br/> (Linux) | clang 9 <br/> (Linux)                                      | clang 10 / 11<br/> (Linux)                            | clang 12 <br/> (Linux)                          | clang 13 <br/> (Linux)                      | clang 14 <br/> (Linux)                            | clang 15 <br/> (Linux)                    | clang 16 <br/> (Linux)                    | clang 17 <br/> (Linux)                    | icpx 2023.1.0 / 2023.2.0 (Linux) | Xcode 13.2.1 / 14.2 / 14.3.1 <br /> (macOS)           | Visual Studio 2022 <br/> (Windows)   |
-|--------------------------------------------------------------------------------|-------------------------------------------------|-------------------------------------------------|---------------------------------------------|------------------------|------------------------------------------------------------|-------------------------------------------------------|-------------------------------------------------|---------------------------------------------|---------------------------------------------------|-------------------------------------------|-------------------------------------------|-------------------------------------------|----------------------------------|-------------------------------------------------------|--------------------------------------|
-| Serial                                                                         | :white_check_mark:                              | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:     | :white_check_mark:                                         | :white_check_mark:                                    | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:                                | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:               | :white_check_mark:                                    | :white_check_mark:                   |
-| OpenMP 2.0+ blocks                                                             | :white_check_mark:                              | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:     | :white_check_mark:                                         | :white_check_mark:                                    | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:                                | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:[^3]           | :white_check_mark:                                    | :white_check_mark:                   |
-| OpenMP 2.0+ threads                                                            | :white_check_mark:                              | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:     | :white_check_mark:                                         | :white_check_mark:                                    | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:                                | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:[^3]           | :white_check_mark:                                    | :white_check_mark:                   |
-| std::thread                                                                    | :white_check_mark:                              | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:     | :white_check_mark:                                         | :white_check_mark:                                    | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:                                | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:               | :white_check_mark:                                    | :white_check_mark:                   |
-| TBB                                                                            | :white_check_mark:                              | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:     | :white_check_mark:                                         | :white_check_mark:                                    | :white_check_mark:                              | :white_check_mark:                          | :white_check_mark:                                | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:               | :white_check_mark:                                    | :white_check_mark:                   |
-| CUDA (nvcc)                                                                    | :white_check_mark: <br/> (CUDA 11.0 - 12.3)[^2] | :white_check_mark: <br/> (CUDA 11.4 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 12.0 - 12.3) | :x:                    | :white_check_mark: <br/> (CUDA 11.0-11.2; 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.6 - 12.0)[^2] | :white_check_mark: <br/> (CUDA 11.7 - 12.0) | :white_check_mark: <br/> (CUDA 11.8 - 12.0)       | :white_check_mark: <br/> (CUDA 12.2)      | :white_check_mark: <br/> (CUDA 12.3)      | :x:                                       | :x:                              | :x:                                                   | :x:                                  |
-| CUDA (clang)                                                                   | -                                               | -                                               | -                                           | :x:                    | :x:                                                        | :x:                                                   | :x:                                             | :x:                                         | :white_check_mark: (CUDA 11.0 - 11.5)             | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.8)[^1] | :x:                              | -                                                     | -                                    |
-| [HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang)        | -                                               | -                                               | -                                           | :x:                    | :x:                                                        | :x:                                                   | :x:                                             | :x:                                         | :white_check_mark: (HIP 5.1 - 5.2)                | :white_check_mark: (HIP 5.3 - 5.4)        | :white_check_mark: (HIP 5.5 - 5.6)        | :white_check_mark: (HIP 5.7 - 6.0)        | :x:                              | -                                                     | -                                    |
-| SYCL                                                                           | :x:                                             | :x:                                             | :x:                                         | :x:                    | :x:                                                        | :x:                                                   | :x:                                             | :x:                                         | :x:                                               | :x:                                       | :x:                                       | :x:                                       | :white_check_mark:[^4]           | :x:                                                   | :x:                                  |
+| Accelerator Back-end | gcc 9.5 (Linux)                           | gcc 10.4 / 11.1 (Linux)                   | gcc 12.3 (Linux)                      | gcc 13.1 (Linux)                      | clang 9 (Linux)                           | clang 10/11 (Linux)                             | clang 12 (Linux)                          | clang 13 (Linux)                      | clang 14 (Linux)                      | clang 15 (Linux)                      | clang 16 (Linux)                      | clang 17 (Linux)                      | icpx 2024.2 (Linux)     | Xcode 13.2.1 / 14.2 / 14.3.1 (macOS) | Visual Studio 2022 (Windows) |
+|----------------------|-------------------------------------------|-------------------------------------------|---------------------------------------|---------------------------------------|-------------------------------------------|-------------------------------------------------|-------------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|-------------------------|--------------------------------------|------------------------------|
+| Serial               | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                        | :white_check_mark:                              | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:      | :white_check_mark:                   | :white_check_mark:           |
+| OpenMP 2.0+ blocks   | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                        | :white_check_mark:                              | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark: [^1] | :white_check_mark:                   | :white_check_mark:           |
+| OpenMP 2.0+ threads  | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                        | :white_check_mark:                              | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark: [^1] | :white_check_mark:                   | :white_check_mark:           |
+| std::thread          | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                        | :white_check_mark:                              | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:      | :white_check_mark:                   | :white_check_mark:           |
+| TBB                  | :white_check_mark:                        | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                        | :white_check_mark:                              | :white_check_mark:                        | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:                    | :white_check_mark:      | :white_check_mark:                   | :white_check_mark:           |
+| CUDA (nvcc)          | :white_check_mark: (CUDA 11.2 - 12.5)[^2] | :white_check_mark: (CUDA 11.4 - 12.0)[^2] | :white_check_mark: (CUDA 12.0 - 12.5) | :white_check_mark: (CUDA 12.4 - 12.5) | :white_check_mark: (CUDA 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.7 - 12.0) | :white_check_mark: (CUDA 11.8 - 12.0) | :white_check_mark: (CUDA 12.2)        | :white_check_mark: (CUDA 12.3)        | :white_check_mark: (CUDA 12.4 - 15.5) | :x:                     | -                                    | :x:                          |
+| CUDA (clang)         | -                                         | -                                         | -                                     | -                                     | :x:                                       | :x:                                             | :x:                                       | :x:                                   | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.8) | :x:                     | -                                    | -                            |
+| HIP (clang)          | -                                         | -                                         | -                                     | -                                     | :x:                                       | :x:                                             | :x:                                       | :x:                                   | :white_check_mark: (HIP 5.1 - 5.2)    | :white_check_mark: (HIP 5.3 - 5.4)    | :white_check_mark: (HIP 5.5 - 5.6)    | :white_check_mark: (HIP 5.7 - 6.1)    | :x:                     | -                                    | -                            |
+| SYCL                 | :x:                                       | :x:                                       | :x:                                   | :x:                                   | :x:                                       | :x:                                             | :x:                                       | :x:                                   | :x:                                   | :x:                                   | :x:                                   | :x:                                   | :white_check_mark: [^4] | -                                    | :x:                          |
 
 Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported.
 
@@ -91,7 +91,7 @@ Dependencies
 The **alpaka** library itself just requires header-only libraries.
 However some of the accelerator back-end implementations require different boost libraries to be built.
 
-When an accelerator back-end using *CUDA* is enabled, version *11.0* (with nvcc as CUDA compiler) or version *9.2* (with clang as CUDA compiler) of the *CUDA SDK* is the minimum requirement.
+When an accelerator back-end using *CUDA* is enabled, version *11.2* (with nvcc as CUDA compiler) or version *11.2* (with clang as CUDA compiler) of the *CUDA SDK* is the minimum requirement.
 *NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported.
 *NOTE*: Separable compilation is disabled by default and can be enabled via the CMake flag `CMAKE_CUDA_SEPARABLE_COMPILATION`.
 
@@ -214,47 +214,7 @@ consider citing us accordingly in your derived work and publications:
 Contributing
 ------------
 
-Rules for contributions can be found in [CONTRIBUTING.md](CONTRIBUTING.md)
-
-Authors
--------
-
-### Maintainers* and Core Developers
-
-- Benjamin Worpitz* (original author)
-- Dr. Sergei Bastrakov*
-- Kseniia Bastrakova
-- Dr. Andrea Bocci*
-- Dr. Antonio Di Pilato
-- Simeon Ehrig
-- Luca Ferragina
-- Bernhard Manfred Gruber*
-- Christian Kaever
-- Dr. Jeffrey Kelling
-- Dr. Stewart Martin-Haugh
-- Aurora Perego
-- Jan Stephan*
-- René Widera*
-- Dr. Jeffrey Young
-
-### Former Members, Contributions and Thanks
-
-- Dr. Michael Bussmann
-- Mat Colgrove
-- Valentin Gehrke
-- Dr. Axel Hübl
-- Maximilian Knespel
-- Jakob Krude
-- Alexander Matthes
-- Hauke Mewes
-- Phil Nash
-- Dr. Felice Pantaleo
-- Dr. David M. Rogers
-- Mutsuo Saito
-- Jonas Schenke
-- Daniel Vollmer
-- Dr. Jiří Vyskočil
-- Matthias Werner
-- Bert Wesarg
-- Malte Zacharias
-- Erik Zenker
+Rules for contributions can be found in [CONTRIBUTING.md](CONTRIBUTING.md).
+Any pull request will be reviewed by a [maintainer](https://github.com/orgs/alpaka-group/teams/alpaka-maintainers).
+
+Thanks to all [active and former contributors](.zenodo.json).
diff --git a/alpaka/README_SYCL.md b/alpaka/README_SYCL.md
index 5a4eadca..09d620c4 100644
--- a/alpaka/README_SYCL.md
+++ b/alpaka/README_SYCL.md
@@ -96,9 +96,8 @@ These can be used interchangeably (some restrictions apply - see below) with the
   ```
   See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information.
 * The FPGA back-end does not support atomics. alpaka will not check this.
-* Device global variables (corresponding to `__device__` and `__constant__` variables in CUDA) are not supported in the SYCL back-end yet.
 * Shared memory works but on the GPU it is very slow.
-* The latest Intel OpenCL CPU runtime does not work properly. Some tests (`atomicTest`, `blockSharedTest`, `blockSharedSharingTest` and `warpTest`) fail with a `PI_ERROR_OUT_OF_RESOURCES`. The only runtime version that seems to work is 2022.14.8.0.04 (can be downloaded [here](https://github.com/intel/llvm/releases/download/2022-WW33/oclcpuexp-2022.14.8.0.04_rel.tar.gz) apart from a bug with `all_of_group` / `any_of_group` that requires the warp size being equal to the block size as a workaround.
+* The latest Intel OpenCL CPU runtime does not work properly. Some tests (`atomicTest`, `blockSharedTest`, `blockSharedSharingTest` and `warpTest`) fail with a `PI_ERROR_OUT_OF_RESOURCES`. The only runtime version that seems to work is 2022.14.8.0.04 (can be downloaded [here](https://github.com/intel/llvm/releases/download/2022-WW33/oclcpuexp-2022.14.8.0.04_rel.tar.gz)) apart from a bug with `all_of_group` / `any_of_group` that requires the warp size being equal to the block size as a workaround.
 
 ### Choosing the sub-group size (warp size)
 
diff --git a/alpaka/benchmarks/CMakeLists.txt b/alpaka/benchmarks/CMakeLists.txt
new file mode 100644
index 00000000..6a8da0ef
--- /dev/null
+++ b/alpaka/benchmarks/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan, Mehmet Yusufoglu
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+################################################################################
+
+cmake_minimum_required(VERSION 3.22)
+
+project("alpakaBenchmarks" LANGUAGES CXX)
+
+################################################################################
+# Add subdirectories.
+################################################################################
+
+if(NOT BUILD_TESTING)
+    # Testing is not enabled therefore CATCH2 which is part of common must be pulled.
+    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../test/common "${CMAKE_BINARY_DIR}/test/common")
+endif()
+
+add_subdirectory("babelstream/")
diff --git a/alpaka/benchmarks/babelstream/CMakeLists.txt b/alpaka/benchmarks/babelstream/CMakeLists.txt
new file mode 100644
index 00000000..5deb1a3e
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+# SPDX-License-Identifier: ISC
+#
+
+cmake_minimum_required(VERSION 3.22)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+project(babelstream LANGUAGES CXX)
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the benchmarks recursively
+        set(alpaka_BUILD_BENCHMARKS OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+
+set(_TARGET_NAME "babelstream")
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+
+target_include_directories(
+ ${_TARGET_NAME}
+ PRIVATE "src")
+
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER benchmarks/babelstream)
+
+#Run as a ctest
+if(alpaka_CI)
+    # Only run for release builds since this is a benchmark
+    if(CMAKE_BUILD_TYPE STREQUAL "Release")
+       add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+    endif()
+else()
+    # For a normal benchmark test, number of samples should be equal to the default value.
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+endif()
diff --git a/alpaka/example/babelstream/src/LICENSE b/alpaka/benchmarks/babelstream/src/LICENSE
similarity index 100%
rename from alpaka/example/babelstream/src/LICENSE
rename to alpaka/benchmarks/babelstream/src/LICENSE
diff --git a/alpaka/benchmarks/babelstream/src/README.md b/alpaka/benchmarks/babelstream/src/README.md
new file mode 100644
index 00000000..cd3eee70
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/src/README.md
@@ -0,0 +1,101 @@
+This work was initially based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young. Then refactored.
+The benchmark BabelStream is developed by Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC; based on John D. McCalpin's original STREAM benchmark for CPUs
+Some implementations and the documents are accessible through https://github.com/UoB-HPC
+
+# Example Run
+Can be run with custom arguments as well as catch2 arguments
+# With Custom arguments:
+./babelstream  --array-size=1280000 --number-runs=10
+# With Catch2 arguments:
+./babelstream --success
+# With Custom and catch2 arguments together:
+./babelstream  --success --array-size=1280000 --number-runs=10
+
+# Command for a benchmarking run
+# ./babelstream --array-size=33554432 --number-runs=100 
+# Otuput is below:
+
+'''Array size provided: 33554432
+Number of runs provided: 100
+Randomness seeded to: 2775986196
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd  :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+Kernels         Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) 
+ InitKernel      12.2133         0.0219789 0.0244341 0.0234795 268.435 
+ CopyKernel      20.8898         0.01285  0.0141298 0.0130288 268.435 
+ MultKernel      20.9943         0.0127861 0.0161767 0.0129707 268.435 
+ AddKernel       24.4181         0.01649  0.0178725 0.0166714 402.653 
+ TriadKernel     24.44           0.0164751 0.0182611 0.0166579 402.653 
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd  :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot  :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels         Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) 
+ InitKernel      62.3725         0.00430374 0.00434411 0.00433501 268.435 
+ CopyKernel      90.2948         0.00297288 0.00302862 0.00300712 268.435 
+ MultKernel      90.3858         0.00296988 0.00302989 0.00300866 268.435 
+ AddKernel       90.947          0.00442734 0.00448436 0.00446751 402.653 
+ TriadKernel     90.88           0.0044306 0.00447952 0.00446739 402.653 
+ DotKernel       93.369          0.002875 0.00291691 0.0029106 268.435 
+
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd  :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivDot  :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels         Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) 
+ InitKernel      12.2326         0.0438886 0.0543366 0.0463925 536.871 
+ CopyKernel      20.8888         0.0257014 0.0272265 0.0260267 536.871 
+ MultKernel      21.0395         0.0255173 0.0292734 0.0262349 536.871 
+ AddKernel       24.6628         0.0326527 0.0383083 0.0334047 805.306 
+ TriadKernel     24.5604         0.0327888 0.0494151 0.0335766 805.306 
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd  :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot  :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels         Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB) 
+ InitKernel      62.4307         0.00859947 0.00864104 0.00862767 536.871 
+ CopyKernel      89.4157         0.00600421 0.00607738 0.00604754 536.871 
+ MultKernel      89.2831         0.00601313 0.00606791 0.0060488 536.871 
+ AddKernel       90.5499         0.00889351 0.00895834 0.00893668 805.306 
+ TriadKernel     90.5685         0.00889168 0.00897055 0.00893744 805.306 
+ DotKernel       93.2451         0.00575763 0.00581312 0.00579143 536.871 
+'''
diff --git a/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp b/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp
new file mode 100644
index 00000000..a22f7d03
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -0,0 +1,440 @@
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <numeric>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+namespace
+{
+    // Default array size, can be changed from command line arguments.
+    // To display cmd line args use ./babelstream --help or -?
+    // According to tests, 2^25 or larger values are needed for proper benchmarking:
+    // ./babelstream --array-size=33554432 --number-runs=100
+    // To prevent timeouts in CI, a smaller default value is used.
+    [[maybe_unused]] auto arraySizeMain = 1024 * 1024;
+
+    // Minimum array size to be used.
+    [[maybe_unused]] constexpr auto minArrSize = 1024 * 128;
+
+    // Scalar value for Mul and Triad kernel parameters.
+    [[maybe_unused]] constexpr auto scalarVal = 2.0f;
+
+    // Block thread extent for DotKernel test work division parameters.
+    [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
+
+    // Number of runs for each kernel, can be changed by command line arguments.
+    // At least 100 runs are recommended for good benchmarking.
+    // To prevent timeouts in CI, a small value is used.
+    [[maybe_unused]] auto numberOfRuns = 2;
+
+    // Data input value for babelstream.
+    [[maybe_unused]] constexpr auto valA = 1.0f;
+
+    //! handleCustomArguments Gets custom cmd line arguments from the all arguments.
+    //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
+    //! command line args for Catch2 session.
+    [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[])
+    {
+        std::vector<char*> newArgv;
+        newArgv.push_back(argv[0]); // Keep the program name
+
+        for(int i = 1; i < argc; ++i)
+        {
+            std::string arg = argv[i];
+            if(arg.rfind("--array-size=", 0) == 0)
+            {
+                auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer
+                if(arrSize > minArrSize)
+                {
+                    arraySizeMain = arrSize;
+                    std::cout << "Array size provided(items): " << arraySizeMain << std::endl;
+                }
+                else
+                {
+                    std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl;
+                    std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl;
+                }
+            }
+            else if(arg.rfind("--number-runs=", 0) == 0)
+            {
+                auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer
+                if(numRuns > 0)
+                {
+                    numberOfRuns = numRuns;
+                    std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                }
+                else
+                {
+                    std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                }
+            }
+            else
+            {
+                // If it's not a custom argument, keep it for Catch2
+                newArgv.push_back(argv[i]);
+            }
+            if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0)
+            {
+                std::cout << "Usage of custom arguments (arguments which are not Catch2):  --array-size=33554432 and "
+                             "--number-runs=100"
+                          << std::endl;
+            }
+        }
+
+        // Update argc and argv to exclude custom arguments
+        argc = static_cast<int>(newArgv.size());
+        for(int i = 0; i < argc; ++i)
+        {
+            argv[i] = newArgv[static_cast<size_t>(i)];
+        }
+    }
+
+    //! FuzzyEqual compares two floating-point or integral type values.
+    //! \tparam T Type of the values to compare.
+    //! \param a First value to compare.
+    //! \param b Second value to compare.
+    //! \return Returns true if the values are approximately equal (for floating-point types) or exactly equal (for
+    //! integral types).
+    template<typename T>
+    [[maybe_unused]] bool FuzzyEqual(T a, T b)
+    {
+        if constexpr(std::is_floating_point_v<T>)
+        {
+            return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
+        }
+        else if constexpr(std::is_integral_v<T>)
+        {
+            return a == b;
+        }
+        else
+        {
+            static_assert(
+                std::is_floating_point_v<T> || std::is_integral_v<T>,
+                "FuzzyEqual<T> is only supported for integral or floating-point types.");
+        }
+    }
+
+    //!   Gets the current timestamp and returns it as a string.
+    //! \return A string representation of the current timestamp in the format "YYYY-MM-DD HH:MM:SS".
+    [[maybe_unused]] static std::string getCurrentTimestamp()
+    {
+        auto now = std::chrono::system_clock::now();
+        auto now_c = std::chrono::system_clock::to_time_t(now);
+        std::stringstream ss;
+        ss << std::put_time(std::localtime(&now_c), "%Y-%m-%d %X");
+        return ss.str();
+    }
+
+    //! joinElements  Joins the elements of a vector into a string, separated by a specified delimiter.
+    //! \tparam T Type of the elements in the vector.
+    //! \param vec The vector of elements to join.
+    //! \param delim The delimiter to separate the elements in the resulting string.
+    //! \return A string with the vector elements separated by the specified delimiter.
+    template<typename T>
+    [[maybe_unused]] static std::string joinElements(std::vector<T> const& vec, std::string const& delim)
+    {
+        return std::accumulate(
+            vec.begin(),
+            vec.end(),
+            std::string(),
+            [&delim](std::string const& a, T const& b)
+            {
+                std::ostringstream oss;
+                if(!a.empty())
+                    oss << a << delim;
+                oss << std::setprecision(5) << b;
+                return oss.str();
+            });
+    }
+
+    //! findMinMax  Finds the minimum and maximum elements in a container.
+    //! \tparam Container The type of the container.
+    //! \param times The container from which to find the minimum and maximum elements.
+    //! \return A pair containing the minimum and maximum values in the container.
+    //! \note The first element is omitted if the container size is larger than 1, as the result is used in time
+    //! measurement for benchmarking.
+    template<typename Container>
+    [[maybe_unused]] static auto findMinMax(Container const& times)
+        -> std::pair<typename Container::value_type, typename Container::value_type>
+    {
+        if(times.empty())
+            return std::make_pair(typename Container::value_type{}, typename Container::value_type{});
+
+        // Default to min and max being the same element for single element containers
+        auto minValue = *std::min_element(times.begin(), times.end());
+        auto maxValue = minValue;
+
+        if(times.size() > 1)
+        {
+            // Calculate min and max ignoring the first element
+            minValue = *std::min_element(times.begin() + 1, times.end());
+            maxValue = *std::max_element(times.begin() + 1, times.end());
+        }
+
+        return std::make_pair(minValue, maxValue);
+    }
+
+    //! findAverage  Calculates the average value of elements in a container, does not take into account the first one.
+    //! \tparam Container The type of the container.
+    //! \param elements The container from which to calculate the average.
+    //! \return The average value of the elements in the container without considering the first element.
+    template<typename Container>
+    [[maybe_unused]] static auto findAverage(Container const& elements) -> typename Container::value_type
+    {
+        if(elements.empty())
+            return typename Container::value_type{};
+
+        if(elements.size() == 1)
+            return elements.front(); // Only one element, return it as the average
+
+        // Calculate the sum of the elements, start from the second one
+        auto sum = std::accumulate(elements.begin() + 1, elements.end(), typename Container::value_type{});
+
+        // Calculate and return the average, take into account that one element is not used
+        return sum / static_cast<typename Container::value_type>(elements.size() - 1);
+    }
+
+    //!   Enum class representing benchmark information data types.
+    enum class BMInfoDataType
+    {
+        AcceleratorType,
+        TimeStamp,
+        NumRuns,
+        DataSize,
+        DataType,
+        WorkDivInit,
+        WorkDivCopy,
+        WorkDivAdd,
+        WorkDivTriad,
+        WorkDivMult,
+        WorkDivDot,
+        DeviceName,
+        TimeUnit,
+        KernelNames,
+        KernelBandwidths,
+        KernelDataUsageValues,
+        KernelMinTimes,
+        KernelMaxTimes,
+        KernelAvgTimes
+    };
+
+    //! typeToTypeStr Converts BMInfoDataType enum values to their corresponding string representations.
+    //! \param item The BMInfoDataType enum type value to convert to a more explicit string with units.
+    //! \return A string representation of the given BMInfoDataType enum value.
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wswitch-default"
+#    pragma clang diagnostic ignored "-Wcovered-switch-default"
+#endif
+    static std::string typeToTypeStr(BMInfoDataType item)
+    {
+        switch(item)
+        {
+        case BMInfoDataType::AcceleratorType:
+            return "AcceleratorType";
+        case BMInfoDataType::TimeStamp:
+            return "TimeStamp";
+        case BMInfoDataType::NumRuns:
+            return "NumberOfRuns";
+        case BMInfoDataType::DataSize:
+            return "DataSize(items)";
+        case BMInfoDataType::DataType:
+            return "Precision";
+        case BMInfoDataType::DeviceName:
+            return "DeviceName";
+        case BMInfoDataType::TimeUnit:
+            return "TimeUnitForXMLReport";
+        case BMInfoDataType::KernelNames:
+            return "Kernels";
+        case BMInfoDataType::KernelDataUsageValues:
+            return "DataUsage(MB)";
+        case BMInfoDataType::KernelBandwidths:
+            return "Bandwidths(GB/s)";
+        case BMInfoDataType::KernelMinTimes:
+            return "MinTime(s)";
+        case BMInfoDataType::KernelMaxTimes:
+            return "MaxTime(s)";
+        case BMInfoDataType::KernelAvgTimes:
+            return "AvgTime(s)";
+        case BMInfoDataType::WorkDivInit:
+            return "WorkDivInit ";
+        case BMInfoDataType::WorkDivCopy:
+            return "WorkDivCopy ";
+        case BMInfoDataType::WorkDivAdd:
+            return "WorkDivAdd  ";
+        case BMInfoDataType::WorkDivTriad:
+            return "WorkDivTriad";
+        case BMInfoDataType::WorkDivMult:
+            return "WorkDivMult ";
+        case BMInfoDataType::WorkDivDot:
+            return "WorkDivDot  ";
+        default:
+            return "";
+        }
+    }
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+    //! getDataThroughput Calculates the data throughput for processing the entire array.
+    //! \tparam DataType The type of the data.
+    //! \tparam T The type of the parameters.
+    //! \param readsWrites The number of read/write operations.
+    //! \param arraySize The size of the array.
+    //! \return The calculated data throughput in MB.
+    template<typename DataType, typename T>
+    [[maybe_unused]] static double getDataThroughput(T readsWrites, T arraySize)
+    {
+        auto throughput = readsWrites * sizeof(DataType) * arraySize;
+        // convert to MB (not MiB)
+        return static_cast<double>(throughput) * 1.0E-6;
+    }
+
+    //! calculateBandwidth Calculates the bandwidth in GB/sec.
+    //! \tparam T The type of bytesReadWriteMB.
+    //! \tparam U The type of runTimeSeconds (e.g., double).
+    //! \param bytesReadWriteMB The amount of data read/write in MB.
+    //! \param runTimeSeconds The runtime in seconds.
+    //! \return The calculated bandwidth in GB/sec.
+    template<typename T, typename U>
+    [[maybe_unused]] static double calculateBandwidth(T bytesReadWriteMB, U runTimeSeconds)
+    {
+        // Divide by 1.0E+3 to convert from MB to GB (not GiB)
+        auto bytesReadWriteGB = static_cast<double>(bytesReadWriteMB) * (1.0E-3);
+        return bytesReadWriteGB / static_cast<double>(runTimeSeconds);
+    }
+
+    //! MetaData class to store and serialize benchmark information.
+    //! \details The MetaData class includes a single map to keep all benchmark information and provides serialization
+    //! methods for generating output.
+    class MetaData
+    {
+    public:
+        //! setItem  Sets an item in the metadata map.
+        //! \tparam T The type of the value to store.
+        //! \param key The BMInfoDataType key.
+        //! \param value The value to store associated with the key.
+        template<typename T>
+        [[maybe_unused]] void setItem(BMInfoDataType key, T const& value)
+        {
+            std::ostringstream oss;
+            oss << value;
+            metaDataMap[key] = oss.str();
+        }
+
+        //! serialize  Serializes the entire metadata to a string.
+        //! \return A string containing the serialized metadata.
+        //! \details This is standard serialization and produces output that can be post-processed easily.
+        [[maybe_unused]] std::string serialize() const
+        {
+            std::stringstream ss;
+            for(auto const& pair : metaDataMap)
+            {
+                ss << "\n" << typeToTypeStr(pair.first) << ":" << pair.second;
+            }
+            return ss.str();
+        }
+
+        //! serializeAsTable Serializes the metadata into a more structured format for easy visual inspection.
+        //! \return A string containing the serialized metadata as a table.
+        //! \details The method first serializes general information, then creates a summary as a table where each row
+        //! represents a kernel.
+        [[maybe_unused]] std::string serializeAsTable() const
+        {
+            std::stringstream ss;
+            // define lambda to add values to a string stream created already
+            auto addItemValue = [&, this](BMInfoDataType item) {
+                ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
+            };
+
+            // Initially chose some data to serialize
+            ss << "\n";
+            addItemValue(BMInfoDataType::AcceleratorType);
+            addItemValue(BMInfoDataType::NumRuns);
+            addItemValue(BMInfoDataType::DataType);
+            addItemValue(BMInfoDataType::DataSize);
+            addItemValue(BMInfoDataType::DeviceName);
+            addItemValue(BMInfoDataType::WorkDivInit);
+            addItemValue(BMInfoDataType::WorkDivCopy);
+            addItemValue(BMInfoDataType::WorkDivMult);
+            addItemValue(BMInfoDataType::WorkDivAdd);
+            addItemValue(BMInfoDataType::WorkDivTriad);
+            if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0)
+                addItemValue(BMInfoDataType::WorkDivDot);
+
+            auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string
+            {
+                std::string const str = metaDataMap.at(item);
+
+                if(index < 1)
+                {
+                    throw std::invalid_argument("Index must be 1 or greater.");
+                }
+
+                std::istringstream iss(str);
+                std::string token;
+                int current_index = 1; // Start at 1 for 1-based indexing
+
+                // Using ", " as the delimiter, we handle the token extraction manually
+                while(std::getline(iss, token, ','))
+                {
+                    // Remove any leading spaces that may be left by `getline`
+                    size_t start = token.find_first_not_of(' ');
+                    if(start != std::string::npos)
+                    {
+                        token = token.substr(start);
+                    }
+
+                    if(current_index == index)
+                    {
+                        return token;
+                    }
+                    ++current_index;
+                }
+
+                throw std::out_of_range("Index out of range");
+            };
+
+            // Prepare Table
+            // Table column names
+            ss << std::endl;
+            ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left
+               << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelBandwidths) << " " << std::left << std::setw(10)
+               << typeToTypeStr(BMInfoDataType::KernelMinTimes) << " " << std::left << std::setw(10)
+               << typeToTypeStr(BMInfoDataType::KernelMaxTimes) << " " << std::left << std::setw(10)
+               << typeToTypeStr(BMInfoDataType::KernelAvgTimes) << " " << std::left << std::setw(6)
+               << typeToTypeStr(BMInfoDataType::KernelDataUsageValues) << " ";
+            ss << std::endl;
+            auto const kernelNamesStr = metaDataMap.at(BMInfoDataType::KernelNames);
+            auto numberOfKernels = std::count(kernelNamesStr.begin(), kernelNamesStr.end(), ',') + 1;
+
+            // Table rows. Print test results for each kernel line by line
+            for(auto i = 1; i <= numberOfKernels; i++)
+            {
+                // Print the row for the kernel i
+                ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " ";
+                ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " ";
+                ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " ";
+                ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMaxTimes, i) << " ";
+                ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelAvgTimes, i) << " ";
+                ss << std::left << std::setw(6) << getItemFromStrList(BMInfoDataType::KernelDataUsageValues, i) << " "
+                   << std::endl;
+            }
+
+            return ss.str();
+        }
+
+    private:
+        std::map<BMInfoDataType, std::string> metaDataMap;
+    };
+} // namespace
diff --git a/alpaka/benchmarks/babelstream/src/babelStreamMainTest.cpp b/alpaka/benchmarks/babelstream/src/babelStreamMainTest.cpp
new file mode 100644
index 00000000..79ec6216
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -0,0 +1,478 @@
+
+#include "babelStreamCommon.hpp"
+#include "catch2/catch_session.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/benchmark/catch_benchmark.hpp>
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <string>
+
+/**
+ * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot.
+ * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP)
+ * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each
+ * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP.
+ *
+ * Some implementations and the documents are accessible through https://github.com/UoB-HPC
+ *
+ * Can be run with custom arguments as well as catch2 arguments
+ * Run with Custom arguments:
+ * ./babelstream --array-size=33554432 --number-runs=100
+ * Runt with default array size and num runs:
+ * ./babelstream
+ * Run with Catch2 arguments and defaul arrary size and num runs:
+ * ./babelstream --success
+ * ./babelstream -r a.xml
+ * Run with Custom and catch2 arguments together:
+ * ./babelstream  --success --array-size=1280000 --number-runs=10
+ * Help to list custom and catch2 arguments
+ * ./babelstream -?
+ * ./babelstream --help
+ *  According to tests, 2^25 or larger data size values are needed for proper benchmarking:
+ *  ./babelstream --array-size=33554432 --number-runs=100
+ */
+
+// Main function that integrates Catch2 and custom argument handling
+int main(int argc, char* argv[])
+{
+    // Handle custom arguments
+    handleCustomArguments(argc, argv);
+
+    // Initialize Catch2 and pass the command-line arguments to it
+    int result = Catch::Session().run(argc, argv);
+
+    // Return the result of the tests
+    return result;
+}
+
+//! Initialization kernel
+struct InitKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param initA the value to set all items in the vector
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const
+    {
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        a[i] = initA;
+        b[i] = static_cast<T>(0.0);
+        c[i] = static_cast<T>(0.0);
+    }
+};
+
+//! Vector copying kernel
+struct CopyKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param b Pointer for vector b
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
+    {
+        auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        b[index] = a[index];
+    }
+};
+
+//! Kernel multiplies the vector with a scalar, scaling or multiplication kernel
+struct MultKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param b Pointer for result vector b
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
+    {
+        const T scalar = static_cast<T>(scalarVal);
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        b[i] = scalar * a[i];
+    }
+};
+
+//! Vector summation kernel
+struct AddKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param b Pointer for vector b
+    //! \param c Pointer for result vector c
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    {
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        c[i] = a[i] + b[i];
+    }
+};
+
+//! Kernel to find the linear combination of 2 vectors by initially scaling one of them
+struct TriadKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param b Pointer for vector b
+    //! \param c Pointer for result vector c
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    {
+        const T scalar = static_cast<T>(scalarVal);
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        c[i] = a[i] + scalar * b[i];
+    }
+};
+
+//! Dot product of two vectors. The result is not a scalar but a vector of block-level dot products. For the
+//! BabelStream implementation and documentation: https://github.com/UoB-HPC
+struct DotKernel
+{
+    //! The kernel entry point
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam T The data type
+    //! \param acc The accelerator to be executed on.
+    //! \param a Pointer for vector a
+    //! \param b Pointer for vector b
+    //! \param sum Pointer for result vector consisting sums for each block
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        auto& tbSum = alpaka::declareSharedVar<T[blockThreadExtentMain], __COUNTER__>(acc);
+
+        auto i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        auto const local_i = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
+        auto const totalThreads = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+        T threadSum = 0;
+        for(; i < arraySize; i += totalThreads)
+            threadSum += a[i] * b[i];
+        tbSum[local_i] = threadSum;
+
+        auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0];
+        for(Idx offset = blockSize / 2; offset > 0; offset /= 2)
+        {
+            alpaka::syncBlockThreads(acc);
+            if(local_i < offset)
+                tbSum[local_i] += tbSum[local_i + offset];
+        }
+
+        auto const gridBlockIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
+        if(local_i == 0)
+            sum[gridBlockIndex] = tbSum[local_i];
+    }
+};
+
+//! \brief The Function for testing babelstream kernels for given Acc type and data type.
+//! \tparam TAcc the accelerator type
+//! \tparam DataType The data type to differentiate single or double data type based tests.
+template<typename TAcc, typename DataType>
+void testKernels()
+{
+    using Acc = TAcc;
+    // Define the index domain
+    // Set the number of dimensions as an integral constant. Set to 1 for 1D.
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    // Meta data
+    // A MetaData class instance to keep the problem and results to print later
+    MetaData metaData;
+    std::string dataTypeStr;
+    if(std::is_same<DataType, float>::value)
+    {
+        dataTypeStr = "single";
+    }
+    else if(std::is_same<DataType, double>::value)
+    {
+        dataTypeStr = "double";
+    }
+
+    using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
+
+    // Select a device
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platform, 0);
+
+    // Create a queue on the device
+    QueueAcc queue(devAcc);
+
+    // Get the host device for allocating memory on the host.
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+
+    // Create vectors
+    Idx arraySize = static_cast<Idx>(arraySizeMain);
+
+    // Acc buffers
+    auto bufAccInputA = alpaka::allocBuf<DataType, Idx>(devAcc, arraySize);
+    auto bufAccInputB = alpaka::allocBuf<DataType, Idx>(devAcc, arraySize);
+    auto bufAccOutputC = alpaka::allocBuf<DataType, Idx>(devAcc, arraySize);
+
+    // Host buffer as the result
+    auto bufHostOutputA = alpaka::allocBuf<DataType, Idx>(devHost, arraySize);
+    auto bufHostOutputB = alpaka::allocBuf<DataType, Idx>(devHost, arraySize);
+    auto bufHostOutputC = alpaka::allocBuf<DataType, Idx>(devHost, arraySize);
+
+    // Grid size and elems per thread will be used to get the work division
+    using Vec = alpaka::Vec<Dim, Idx>;
+    auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
+    auto const elementsPerGrid = Vec::all(arraySize);
+
+    // Create pointer variables for buffer access
+    auto bufAccInputAPtr = std::data(bufAccInputA);
+    auto bufAccInputBPtr = std::data(bufAccInputB);
+    auto bufAccOutputCPtr = std::data(bufAccOutputC);
+
+    // Bind gridsize and elements per thread together
+    alpaka::KernelCfg<Acc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+    // Let alpaka calculate good work division (namely the block and grid sizes) given our full problem extent
+    auto const workDivInit = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        InitKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr,
+        static_cast<DataType>(valA));
+    auto const workDivCopy
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr);
+    auto const workDivMult
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, MultKernel(), bufAccInputAPtr, bufAccInputBPtr);
+    auto const workDivAdd
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr);
+
+    auto const workDivTriad = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        TriadKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr);
+
+    // Vector of average run-times of babelstream kernels
+    std::vector<double> avgExecTimesOfKernels;
+    std::vector<double> minExecTimesOfKernels;
+    std::vector<double> maxExecTimesOfKernels;
+    std::vector<std::string> kernelLabels;
+    // Vector for collecting successive run-times of a single kernel in benchmark macro
+    std::vector<double> times;
+
+    // Lambda for measuring run-time
+    auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel)
+    {
+        for(auto i = 0; i < numberOfRuns; i++)
+        {
+            double runtime = 0.0;
+            auto start = std::chrono::high_resolution_clock::now();
+            kernelFunc();
+            alpaka::wait(queue);
+            auto end = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<double> duration = end - start;
+            runtime = duration.count();
+            times.push_back(runtime);
+        }
+
+        // find the minimum of the durations array.
+        // In benchmarking the first item of the runtimes array is not included in calculations.
+        const auto minmaxPair = findMinMax(times);
+        minExecTimesOfKernels.push_back(minmaxPair.first);
+        maxExecTimesOfKernels.push_back(minmaxPair.second);
+        avgExecTimesOfKernels.push_back(findAverage(times));
+        kernelLabels.push_back(kernelLabel);
+        times.clear();
+    };
+
+    // Run kernels one by one
+    // Test the init-kernel.
+    measureKernelExec(
+        [&]()
+        {
+            alpaka::exec<Acc>(
+                queue,
+                workDivInit,
+                InitKernel(),
+                bufAccInputAPtr,
+                bufAccInputBPtr,
+                bufAccOutputCPtr,
+                static_cast<DataType>(valA));
+        },
+        "InitKernel");
+
+    // Test the copy-kernel. Copy A one by one to B.
+    measureKernelExec(
+        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
+        "CopyKernel");
+
+    // Test the scaling-kernel. Calculate B=scalar*A.
+    measureKernelExec(
+        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
+        "MultKernel");
+
+    // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
+    measureKernelExec(
+        [&]()
+        { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
+        "AddKernel");
+
+    // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
+    measureKernelExec(
+        [&]()
+        { alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
+        "TriadKernel");
+
+
+    // Copy arrays back to host
+    alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize);
+    alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize);
+    alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize);
+
+    // Verify the results
+    //
+    // Find sum of the errors as sum of the differences from expected values
+    DataType initVal{static_cast<DataType>(0.0)};
+    DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
+
+    auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);
+    auto const expectedB = static_cast<DataType>(scalarVal * valA);
+    auto const expectedA = static_cast<DataType>(valA);
+
+    // sum of the errors for each array
+    for(Idx i = 0; i < arraySize; ++i)
+    {
+        sumErrC += bufHostOutputC[static_cast<Idx>(i)] - expectedC;
+        sumErrB += bufHostOutputB[static_cast<Idx>(i)] - expectedB;
+        sumErrA += bufHostOutputA[static_cast<Idx>(i)] - expectedA;
+    }
+
+    // Normalize and compare sum of the errors
+    REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
+    REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
+    REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+    alpaka::wait(queue);
+
+    // Test Dot kernel with specific blocksize which is larger than 1
+    if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
+    {
+        using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+        // Threads per block for Dot kernel
+        constexpr Idx blockThreadExtent = blockThreadExtentMain;
+        // Blocks per grid for Dot kernel
+        constexpr Idx gridBlockExtent = static_cast<Idx>(256);
+        // Vector of sums of each block
+        auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
+        auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
+        // A specific work-division is used for dotKernel
+        auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
+
+        measureKernelExec(
+            [&]()
+            {
+                alpaka::exec<Acc>(
+                    queue,
+                    workDivDot,
+                    DotKernel(), // Dot kernel
+                    alpaka::getPtrNative(bufAccInputA),
+                    alpaka::getPtrNative(bufAccInputB),
+                    alpaka::getPtrNative(bufAccSumPerBlock),
+                    static_cast<alpaka::Idx<Acc>>(arraySize));
+            },
+            "DotKernel");
+
+        alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+        alpaka::wait(queue);
+
+        DataType const* sumPtr = std::data(bufHostSumPerBlock);
+        auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
+        // Since vector values are 1, dot product should be identical to arraySize
+        REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));
+        // Add workdiv to the list of workdivs to print later
+        metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
+    }
+
+
+    //
+    // Calculate and Display Benchmark Results
+    //
+    std::vector<double> bytesReadWriteMB = {
+        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
+        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
+        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
+        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
+        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
+        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
+    };
+
+    // calculate the bandwidth as throughput per seconds
+    std::vector<double> bandwidthsPerKernel;
+    if(minExecTimesOfKernels.size() == kernelLabels.size())
+    {
+        for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+        {
+            bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+        }
+    }
+
+    // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map
+    metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp());
+    metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns));
+    metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain));
+    metaData.setItem(BMInfoDataType::DataType, dataTypeStr);
+
+    metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
+    metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
+    metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
+    metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
+    metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
+
+    // Device and accelerator
+    metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc));
+    metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName<Acc>());
+    // XML reporter of catch2 always converts to Nano Seconds
+    metaData.setItem(BMInfoDataType::TimeUnit, "Nano Seconds");
+    // Join elements and create a comma separated string
+    metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", "));
+    metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(bytesReadWriteMB, ", "));
+    metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(bandwidthsPerKernel, ", "));
+    metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(minExecTimesOfKernels, ", "));
+    metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(maxExecTimesOfKernels, ", "));
+    metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(avgExecTimesOfKernels, ", "));
+
+    // Print the summary as a table, if a standard serialization is needed other functions of the class can be used
+    std::cout << metaData.serializeAsTable() << std::endl;
+}
+
+using TestAccs1D = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
+
+// Run for all Accs given by the argument
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Float>", "[benchmark-test]", TestAccs1D)
+{
+    using Acc = TestType;
+    // Run tests for the float data type
+    testKernels<Acc, float>();
+}
+
+// Run for all Accs given by the argument
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Double>", "[benchmark-test]", TestAccs1D)
+{
+    using Acc = TestType;
+    // Run tests for the double data type
+    testKernels<Acc, double>();
+}
diff --git a/alpaka/cmake/addExecutable.cmake b/alpaka/cmake/addExecutable.cmake
index f80f3ae8..2f7ad491 100644
--- a/alpaka/cmake/addExecutable.cmake
+++ b/alpaka/cmake/addExecutable.cmake
@@ -20,8 +20,8 @@ macro(alpaka_add_executable In_Name)
     if(alpaka_ACC_GPU_CUDA_ENABLE)
        enable_language(CUDA)
        foreach(_file ${ARGN})
-            if((${_file} MATCHES "\\.cpp$") OR 
-               (${_file} MATCHES "\\.cxx$") OR 
+            if((${_file} MATCHES "\\.cpp$") OR
+               (${_file} MATCHES "\\.cxx$") OR
                (${_file} MATCHES "\\.cu$")
             )
                 set_source_files_properties(${_file} PROPERTIES LANGUAGE CUDA)
@@ -49,7 +49,7 @@ macro(alpaka_add_executable In_Name)
 
         # We have to set this here because CMake currently doesn't provide hip_std_${VERSION} for
         # target_compile_features() and HIP_STANDARD isn't propagated by interface libraries.
-        set_target_properties(${In_Name} PROPERTIES 
+        set_target_properties(${In_Name} PROPERTIES
                               HIP_STANDARD ${alpaka_CXX_STANDARD}
                               HIP_STANDARD_REQUIRED ON)
     endif()
diff --git a/alpaka/cmake/addLibrary.cmake b/alpaka/cmake/addLibrary.cmake
index 103b3563..f6cf7028 100644
--- a/alpaka/cmake/addLibrary.cmake
+++ b/alpaka/cmake/addLibrary.cmake
@@ -13,10 +13,10 @@
 # Using a macro to stay in the scope (fixes lost assignment of linker command in FindHIP.cmake)
 # https://github.com/ROCm-Developer-Tools/HIP/issues/631
 
-macro(alpaka_add_library libraryName)
+macro(alpaka_add_library In_Name)
     # add_library( <name> [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [<source>...])
 
-    add_library(${libraryName} ${ARGN})
+    add_library(${In_Name} ${ARGN})
 
     if(alpaka_ACC_GPU_CUDA_ENABLE)
         enable_language(CUDA)
@@ -50,7 +50,7 @@ macro(alpaka_add_library libraryName)
 
         # We have to set this here because CMake currently doesn't provide hip_std_${VERSION} for
         # target_compile_features() and HIP_STANDARD isn't propagated by interface libraries.
-        set_target_properties(${In_Name} PROPERTIES 
+        set_target_properties(${In_Name} PROPERTIES
                               HIP_STANDARD ${alpaka_CXX_STANDARD}
                               HIP_STANDARD_REQUIRED ON)
     endif()
diff --git a/alpaka/cmake/alpakaCommon.cmake b/alpaka/cmake/alpakaCommon.cmake
index b17fefa4..09872bec 100644
--- a/alpaka/cmake/alpakaCommon.cmake
+++ b/alpaka/cmake/alpakaCommon.cmake
@@ -177,12 +177,12 @@ if(MSVC)
 else()
     # For std::future we need to pass the correct pthread flag for the compiler and the linker:
     # https://github.com/alpaka-group/cupla/pull/128#issuecomment-545078917
-    
+
     # Allow users to override the "-pthread" preference.
     if(NOT THREADS_PREFER_PTHREAD_FLAG)
         set(THREADS_PREFER_PTHREAD_FLAG TRUE)
     endif()
-    
+
     find_package(Threads REQUIRED)
     target_link_libraries(alpaka INTERFACE Threads::Threads)
 
@@ -200,7 +200,7 @@ else()
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>,$<COMPILE_LANGUAGE:CUDA>>:SHELL:-Xcompiler -Og>"
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>"
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:MSVC>>:SHELL:/Od>")
-    
+
     target_link_options(alpaka INTERFACE "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>>:SHELL:-Og>"
                                          "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>")
 endif()
@@ -350,6 +350,11 @@ if(alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
     else()
         find_package(OpenMP REQUIRED COMPONENTS CXX)
         target_link_libraries(alpaka INTERFACE OpenMP::OpenMP_CXX)
+        # shown with CMake 3.29 and cray clang 17
+        # workaround: cmake is missing to add '-fopenmp' to the linker flags
+        if(CMAKE_CXX_COMPILER_ID STREQUAL "CrayClang")
+            target_link_libraries(alpaka INTERFACE -fopenmp)
+        endif()
     endif()
 endif()
 
@@ -358,7 +363,7 @@ endif()
 if(alpaka_ACC_GPU_CUDA_ENABLE)
     # Save the user-defined host compiler (if any)
     set(_alpaka_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER})
-    
+
     check_language(CUDA)
 
     if(CMAKE_CUDA_COMPILER)
@@ -373,7 +378,44 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
             endif()
         endif()
 
+        # the CMake compiler detection of clang 17 and 18 as CUDA compiler is broken
+        # the detection try to compile an empty file with the default C++ standard of clang, which is -std=gnu++17
+        # but CUDA does not support the 128 bit float extension, therefore the test failes
+        # more details: https://gitlab.kitware.com/cmake/cmake/-/issues/25861
+        # this workaround disable the gnu extensions for the compiler detection
+        # the bug is fixed in clang 19: https://github.com/llvm/llvm-project/issues/88695
+        if("${CMAKE_CUDA_COMPILER}" MATCHES "clang*")
+            # get compiler version without enable_language()
+            execute_process(COMMAND ${CMAKE_CUDA_COMPILER} -dumpversion
+                   OUTPUT_VARIABLE _CLANG_CUDA_VERSION
+                   RESULT_VARIABLE _CLANG_CUDA_VERSION_ERROR_CODE)
+
+            if(NOT "${_CLANG_CUDA_VERSION_ERROR_CODE}" STREQUAL "0")
+                message(FATAL_ERROR "running '${CMAKE_CUDA_COMPILER} -dumpversion' failed: ${_CLANG_CUDA_VERSION_ERROR_CODE}")
+            endif()
+
+            string(STRIP ${_CLANG_CUDA_VERSION} _CLANG_CUDA_VERSION)
+            message(DEBUG "Workaround: manual checked Clang-CUDA version: ${_CLANG_CUDA_VERSION}")
+
+            if(${_CLANG_CUDA_VERSION} VERSION_GREATER_EQUAL 17 AND ${_CLANG_CUDA_VERSION} VERSION_LESS 19)
+                message(DEBUG "Workaround: apply -std=c++98 for clang as cuda compiler")
+                set(_CMAKE_CUDA_FLAGS_BEFORE ${CMAKE_CUDA_FLAGS})
+                # we need to use C++ 98 for the detection test, because from new, disabling the extension is ignored for C++ 98
+                set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++98")
+            endif()
+        endif()
+
         enable_language(CUDA)
+
+        if(DEFINED _CLANG_CUDA_VERSION)
+            message(DEBUG "Workaround: reset variables for clang as cuda compiler -std=c++98 fix")
+            # remove the flag compiler -std=c++98
+            set(CMAKE_CUDA_FLAGS ${_CMAKE_CUDA_FLAGS_BEFORE})
+            unset(_CMAKE_CUDA_FLAGS_BEFORE)
+            unset(_CLANG_CUDA_VERSION)
+            unset(_CLANG_CUDA_VERSION_ERROR_CODE)
+        endif()
+
         find_package(CUDAToolkit REQUIRED)
 
         target_compile_features(alpaka INTERFACE cuda_std_${alpaka_CXX_STANDARD})
@@ -426,6 +468,10 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
         elseif(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
             message(STATUS "nvcc is used as CUDA compiler")
 
+            if(alpaka_CXX_STANDARD GREATER_EQUAL 20 AND CMAKE_VERSION VERSION_LESS "3.25.0")
+                message(FATAL_ERROR "CMake 3.24 and older does not support C++20 for nvcc")
+            endif()
+
             # nvcc sets no linux/__linux macros on OpenPOWER linux
             # nvidia bug id: 2448610
             if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
@@ -494,9 +540,31 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
             endif()
         endif()
 
+        # Use the Shared CUDA Runtime library by default
+        if(NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY)
+            set(CMAKE_CUDA_RUNTIME_LIBRARY "Shared")
+        endif()
+
+        # Link the CUDA Runtime library
+        if(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Shared")
+            target_link_libraries(alpaka INTERFACE CUDA::cudart)
+        elseif(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Static")
+            target_link_libraries(alpaka INTERFACE CUDA::cudart_static)
+        elseif(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "None")
+            message(WARNING "Building alpaka applications with CMAKE_CUDA_RUNTIME_LIBRARY=None is not supported.")
+        else()
+            message(FATAL_ERROR "Invalid setting for CMAKE_CUDA_RUNTIME_LIBRARY.")
+        endif()
+
         if(NOT alpaka_DISABLE_VENDOR_RNG)
             # Use cuRAND random number generators
-            target_link_libraries(alpaka INTERFACE CUDA::cudart CUDA::curand)
+            if(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Shared")
+                target_link_libraries(alpaka INTERFACE CUDA::curand)
+            elseif(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "Static")
+                target_link_libraries(alpaka INTERFACE CUDA::curand_static)
+            elseif(CMAKE_CUDA_RUNTIME_LIBRARY STREQUAL "None")
+                message(FATAL_ERROR "cuRAND requires the CUDA runtime library.")
+            endif()
         endif()
     else()
         message(FATAL_ERROR "Optional alpaka dependency CUDA could not be found!")
@@ -514,7 +582,7 @@ if(alpaka_ACC_GPU_HIP_ENABLE)
         find_package(hip REQUIRED)
 
         set(_alpaka_HIP_MIN_VER 5.1)
-        set(_alpaka_HIP_MAX_VER 6.0)
+        set(_alpaka_HIP_MAX_VER 6.2)
 
         # construct hip version only with major and minor level
         # cannot use hip_VERSION because of the patch level
@@ -526,6 +594,8 @@ if(alpaka_ACC_GPU_HIP_ENABLE)
             message(WARNING "HIP ${_hip_MAJOR_MINOR_VERSION} is not official supported by alpaka. Supported versions: ${_alpaka_HIP_MIN_VER} - ${_alpaka_HIP_MAX_VER}")
         endif()
 
+        # let the compiler find the HIP headers also when building host-only code
+        target_include_directories(alpaka SYSTEM INTERFACE ${hip_INCLUDE_DIR})
 
         target_link_libraries(alpaka INTERFACE "$<$<LINK_LANGUAGE:CXX>:hip::host>")
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "$<$<COMPILE_LANGUAGE:CXX>:-D__HIP_PLATFORM_AMD__>")
@@ -566,7 +636,7 @@ if(alpaka_ACC_GPU_HIP_ENABLE)
         endif()
 
         if(alpaka_RELOCATABLE_DEVICE_CODE STREQUAL ON)
-            alpaka_set_compiler_options(DEVICE target alpaka "$<$<COMPILE_LANGUAGE:HIP>:SHELL-fgpu-rdc>")
+            alpaka_set_compiler_options(DEVICE target alpaka "$<$<COMPILE_LANGUAGE:HIP>:SHELL:-fgpu-rdc>")
             target_link_options(alpaka INTERFACE "$<$<LINK_LANGUAGE:HIP>:SHELL:-fgpu-rdc --hip-link>")
         elseif(alpaka_RELOCATABLE_DEVICE_CODE STREQUAL OFF)
             alpaka_set_compiler_options(DEVICE target alpaka "$<$<COMPILE_LANGUAGE:HIP>:SHELL:-fno-gpu-rdc>")
@@ -619,9 +689,9 @@ if(alpaka_ACC_SYCL_ENABLE)
         list(JOIN alpaka_SYCL_TARGETS "," alpaka_SYCL_TARGETS_CONCAT)
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
         target_link_options(alpaka INTERFACE "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
-        
+
         #-----------------------------------------------------------------------------------------------------------------
-        # Determine actual hardware to compile for 
+        # Determine actual hardware to compile for
         if(alpaka_SYCL_ONEAPI_CPU)
             set(alpaka_SYCL_ONEAPI_CPU_ISA "avx2" CACHE STRING "Intel ISA to compile for")
             set_property(CACHE alpaka_SYCL_ONEAPI_CPU_ISA PROPERTY STRINGS "sse4.2;avx;avx2;avx512")
@@ -663,7 +733,7 @@ if(alpaka_ACC_SYCL_ENABLE)
                         PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
             # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
             string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")
-            
+
             target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
         endif()
 
@@ -679,7 +749,7 @@ if(alpaka_ACC_SYCL_ENABLE)
             target_link_options(alpaka INTERFACE "-fno-sycl-rdc")
         endif()
     else()
-        message(FATAL_ERROR "alpaka currently does not support SYCL implementations other than oneAPI.")
+        message(FATAL_ERROR "alpaka currently does not support SYCL implementations other than oneAPI: ${CMAKE_CXX_COMPILER_ID}.")
     endif()
 
     if(NOT alpaka_DISABLE_VENDOR_RNG)
diff --git a/alpaka/docs/cheatsheet/cheatsheet.style b/alpaka/docs/cheatsheet/cheatsheet.style
index 8e8162ed..33eb4909 100644
--- a/alpaka/docs/cheatsheet/cheatsheet.style
+++ b/alpaka/docs/cheatsheet/cheatsheet.style
@@ -8,7 +8,7 @@
                   "firstTemplate": "twoColumn",
                   "width": "29.7cm",
                   "height": "21cm"
-		 },
+         },
     "pageTemplates" : {
         "threeColumn": {
             "frames": [
@@ -16,36 +16,36 @@
                 ["35.333%", "0cm", "29.333%", "100%"],
                 ["68.666%", "0cm", "29.333%", "100%"]
             ]
-	}
+    }
     },
     "fontsAlias" : {
         "stdMono": "CPMono_v07 Plain"
     },
     "styles" : [
-	[ "base", {
-	    "fontSize": 10
-	}
-	],
-	["code" , {
-	    "parent": "literal",
-	    "leftIndent": 0,
-	    "spaceBefore": 0,
-	    "spaceAfter": 4,
-	    "backColor": null,
-	    "borderColor": null,
-	    "borderWidth": 0,
-	    "leading":7,
-	    "borderPadding": [1,1,5,1],
-	    "fontSize": 8
-	}],
-	["bodytext" , {
-	    "spaceBefore":0
-	}],
-	["small" , {
-	    "parent": "base",
-	    "fontSize": 6
-	}],
-	["heading1", {
+    [ "base", {
+        "fontSize": 10
+    }
+    ],
+    ["code" , {
+        "parent": "literal",
+        "leftIndent": 0,
+        "spaceBefore": 0,
+        "spaceAfter": 4,
+        "backColor": null,
+        "borderColor": null,
+        "borderWidth": 0,
+        "leading":7,
+        "borderPadding": [1,1,5,1],
+        "fontSize": 8
+    }],
+    ["bodytext" , {
+        "spaceBefore":0
+    }],
+    ["small" , {
+        "parent": "base",
+        "fontSize": 6
+    }],
+    ["heading1", {
             "backColor": "#00599dff",
             "borderColor": "#00599dff",
             "borderWidth": 0.2,
@@ -57,102 +57,102 @@
             "leftIndent": 0,
             "fontSize": 12,
             "fontName": "stdSansBold"
-	}],
-	["faketitle" , {
-	    "borderPadding": [3,0,1,0],
-	    "fontSize": 8,
-	    "spaceBefore": 4,
-	    "spaceAfter": 4,
-	    "fontName": "stdSansBold"
-	}],
-	["nota", { "parent": "heading",
-		   "fontSize": 6,
-		   "fontName": "stdSansBold",
-		   "textColor": "#FFFFFF",
-		   "alignment": "TA_RIGHT"
-		 }],
-	["table" , {
-	    "spaceBefore":0,
-	    "spaceAfter":3,
-	    "colWidths": ["50%","50%"],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
-		[ "TOPPADDING", [0, 0], [-1, -1], 1 ],
-		[ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
-	    ]
-	}],
-	["exampletable1" , {
-	    "spaceBefore":0,
-	    "spaceAfter":3,
-	    "colWidths": ["33.3%","33.3%","33.3%"],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
-		[ "GRID", [0, 0], [-1, -1], 0.2, "#446885" ],
-		[ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ]
-	    ]
-	}],
-	["faketrans" , {
-	    "spaceBefore":3,
-	    "spaceAfter":3,
-	    "colWidths": ["100%"],
-	    "commands": [
-		[ "LINEABOVE", [0, 0], [-1, -1], 0.8, "#446885" ]
-	    ]
-	}],
-	["tablapie" , {
-	    "spaceBefore":0,
-	    "spaceAfter":0,
-	    "colWidths": ["52%","19%","12%","17%"],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
-		[ "LINEABOVE", [0, 0], [-1, -1], 0.4, "#446885" ]
-	    ]
-	}],
-	["izqfina" , {
-	    "spaceBefore":0,
-	    "spaceAfter":6,
-	    "colWidths": ["10%",null],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
-		[ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
-	    ]
-	}],
-	["tablacreditos", {
-	    "parent": "bodytext",
-	    "spaceBefore":-1,
-	    "spaceAfter":0,
-	    "colWidths": ["50%","50%"],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
-		[ "TOPPADDING", [0, 0], [1, 0], 3 ]
-	    ]
-	}],
-	[ "endnote", {
+    }],
+    ["faketitle" , {
+        "borderPadding": [3,0,1,0],
+        "fontSize": 8,
+        "spaceBefore": 4,
+        "spaceAfter": 4,
+        "fontName": "stdSansBold"
+    }],
+    ["nota", { "parent": "heading",
+           "fontSize": 6,
+           "fontName": "stdSansBold",
+           "textColor": "#FFFFFF",
+           "alignment": "TA_RIGHT"
+         }],
+    ["table" , {
+        "spaceBefore":0,
+        "spaceAfter":3,
+        "colWidths": ["50%","50%"],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
+        [ "TOPPADDING", [0, 0], [-1, -1], 1 ],
+        [ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
+        ]
+    }],
+    ["exampletable1" , {
+        "spaceBefore":0,
+        "spaceAfter":3,
+        "colWidths": ["33.3%","33.3%","33.3%"],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
+        [ "GRID", [0, 0], [-1, -1], 0.2, "#446885" ],
+        [ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ]
+        ]
+    }],
+    ["faketrans" , {
+        "spaceBefore":3,
+        "spaceAfter":3,
+        "colWidths": ["100%"],
+        "commands": [
+        [ "LINEABOVE", [0, 0], [-1, -1], 0.8, "#446885" ]
+        ]
+    }],
+    ["tablapie" , {
+        "spaceBefore":0,
+        "spaceAfter":0,
+        "colWidths": ["52%","19%","12%","17%"],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+        [ "LINEABOVE", [0, 0], [-1, -1], 0.4, "#446885" ]
+        ]
+    }],
+    ["izqfina" , {
+        "spaceBefore":0,
+        "spaceAfter":6,
+        "colWidths": ["10%",null],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+        [ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
+        ]
+    }],
+    ["tablacreditos", {
+        "parent": "bodytext",
+        "spaceBefore":-1,
+        "spaceAfter":0,
+        "colWidths": ["50%","50%"],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
+        [ "TOPPADDING", [0, 0], [1, 0], 3 ]
+        ]
+    }],
+    [ "endnote", {
             "parent": "bodytext",
             "colWidths": [52,null],
             "spaceAfter": 4,
             "commands": [
-		[ "VALIGN", [ 0, 0 ], [ -1, -1 ], "TOP" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
-		[ "TOPPADDING", [0, 0], [-1, -1], 1 ],
-		[ "LINEBEFORE", [0, 0], [0,-1], 1, "#E1E6EA" ]
+        [ "VALIGN", [ 0, 0 ], [ -1, -1 ], "TOP" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
+        [ "TOPPADDING", [0, 0], [-1, -1], 1 ],
+        [ "LINEBEFORE", [0, 0], [0,-1], 1, "#E1E6EA" ]
             ]
-	}],
-	["extranote" , {
-	    "spaceBefore":0,
-	    "spaceAfter":0,
-	    "colWidths": [27,null],
-	    "commands": [
-		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
-		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
-		[ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ],
-		[ "COLBACKGROUNDS", [0,0], [-1,-1], ["#446885", "#FFFFFF"]]
-	    ]
-	}]
+    }],
+    ["extranote" , {
+        "spaceBefore":0,
+        "spaceAfter":0,
+        "colWidths": [27,null],
+        "commands": [
+        [ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+        [ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+        [ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ],
+        [ "COLBACKGROUNDS", [0,0], [-1,-1], ["#446885", "#FFFFFF"]]
+        ]
+    }]
     ]
 }
diff --git a/alpaka/docs/source/advanced/cmake.rst b/alpaka/docs/source/advanced/cmake.rst
index 3e12f4a1..4c8be593 100644
--- a/alpaka/docs/source/advanced/cmake.rst
+++ b/alpaka/docs/source/advanced/cmake.rst
@@ -117,6 +117,11 @@ alpaka_BUILD_EXAMPLES
 
      Build the examples.
 
+alpaka_BUILD_BENCHMARKS
+  .. code-block::
+
+     Build the benchmarks.
+
 BUILD_TESTING
   .. code-block::
 
@@ -162,7 +167,7 @@ alpaka_FTZ
 alpaka_DEBUG_OFFLOAD_ASSUME_HOST
   .. code-block::
 
-     Allow host-only contructs like assert in offload code in debug mode.
+     Allow host-only constructs like assert in offload code in debug mode.
 
 alpaka_USE_MDSPAN
   .. code-block::
diff --git a/alpaka/docs/source/basic/abstraction.rst b/alpaka/docs/source/basic/abstraction.rst
index 6ecd599a..20f287d5 100644
--- a/alpaka/docs/source/basic/abstraction.rst
+++ b/alpaka/docs/source/basic/abstraction.rst
@@ -189,7 +189,7 @@ Control flow statements result in a predicate and only in those threads where it
 Not only *CUDA* GPUs support the execution of multiple threads in a warp.
 Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*.
 
-Due to the synchronitiy of threads within a warp, memory operations will always occur at the same time in all threads.
+Due to the synchronization of threads within a warp, memory operations will always occur at the same time in all threads.
 This allows to coalesce memory accesses.
 Different *CUDA* devices support different levels of memory coalescing.
 Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices.
@@ -272,4 +272,3 @@ They can be synchronized by using events.
 Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device.
 Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain.
 Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially.
-
diff --git a/alpaka/docs/source/basic/cheatsheet.rst b/alpaka/docs/source/basic/cheatsheet.rst
index 3b22aa7a..5295a56d 100644
--- a/alpaka/docs/source/basic/cheatsheet.rst
+++ b/alpaka/docs/source/basic/cheatsheet.rst
@@ -116,20 +116,18 @@ Create a CPU device for memory allocation on the host side
 Allocate a buffer in host memory
   .. code-block:: c++
 
-     Vec<Dim, Idx> extent = value;
-     using BufHost = Buf<DevHost, DataType, Dim, Idx>;
+     // Use alpaka vector as a static array for the extents
+     alpaka::Vec<Dim, Idx> extent = value;
+     // Allocate memory for the alpaka buffer, which is a dynamic array
+     using BufHost = alpaka::Buf<DevHost, DataType, Dim, Idx>;
      BufHost bufHost = allocBuf<DataType, Idx>(devHost, extent);
 
-(Optional, affects CPU – GPU memory copies) Prepare it for asynchronous memory copies
-  .. code-block:: c++
-
-     prepareForAsyncCopy(bufHost);
-
 Create a view to host memory represented by a pointer
   .. code-block:: c++
 
      using Dim = alpaka::DimInt<1u>;
-     Vec<Dim, Idx> extent = size;
+     // Create an alpaka vector which is a static array
+     alpaka::Vec<Dim, Idx> extent = size;
      DataType* ptr = ...;
      auto hostView = createView(devHost, ptr, extent);
 
@@ -151,6 +149,18 @@ Get a raw pointer to a buffer or view initialization, etc.
      DataType* raw = view::getPtrNative(bufHost);
      DataType* rawViewPtr = view::getPtrNative(hostView);
 
+Get the pitches (memory in bytes to the next element in the buffer along the pitch dimension) of a buffer
+  .. code-block:: c++
+
+     auto pitchBufAcc = alpaka::getPitchesInBytes(bufAcc)
+     auto pitchViewAcc = alpaka::getPitchesInBytes(viewAcc)
+
+Get a mdspan to a buffer or view initialization, etc.
+  .. code-block:: c++
+
+     auto bufMdSpan = alpaka::experimental::getMdSpan(bufAcc)
+     auto viewMdSpan = alpaka::experimental::getMdSpan(viewAcc)
+
 Allocate a buffer in device memory
   .. code-block:: c++
 
@@ -159,6 +169,7 @@ Allocate a buffer in device memory
 Enqueue a memory copy from host to device
   .. code-block:: c++
 
+     // arguments can be also alpaka::View instances instead of alpaka::Buf
      memcpy(queue, bufDevice, bufHost, extent);
 
 Enqueue a memory copy from device to host
@@ -172,6 +183,10 @@ Enqueue a memory copy from device to host
 
 Kernel Execution
 ----------------
+Prepare Kernel Bundle
+  .. code-block:: c++
+
+     HeatEquationKernel heatEqKernel;
 
 Automatically select a valid kernel launch configuration
   .. code-block:: c++
@@ -179,11 +194,21 @@ Automatically select a valid kernel launch configuration
      Vec<Dim, Idx> const globalThreadExtent = vectorValue;
      Vec<Dim, Idx> const elementsPerThread = vectorValue;
 
-     auto autoWorkDiv = getValidWorkDiv<Acc>(
-       device,
-       globalThreadExtent, elementsPerThread,
+     KernelCfg<Acc> const kernelCfg = {
+       globalThreadExtent,
+       elementsPerThread,
        false,
-       GridBlockExtentSubDivRestrictions::Unrestricted);
+       GridBlockExtentSubDivRestrictions::Unrestricted};
+
+     auto autoWorkDiv = getValidWorkDiv(
+       kernelCfg,
+       device,
+       heatEqKernel,
+       pCurrAcc,
+       pNextAcc,
+       numNodesX,
+       dx,
+       dt);
 
 Manually set a kernel launch configuration
   .. code-block:: c++
@@ -193,22 +218,28 @@ Manually set a kernel launch configuration
      Vec<Dim, Idx> const elementsPerThread = vectorValue;
 
      using WorkDiv = WorkDivMembers<Dim, Idx>;
-     auto manualWorkDiv = WorkDiv{blocksPerGrid,
-                                  threadsPerBlock,
-				  elementsPerThread};
+     auto manualWorkDiv = WorkDiv{
+       blocksPerGrid,
+       threadsPerBlock,
+       elementsPerThread};
 
-Instantiate a kernel and create a task that will run it (does not launch it yet)
+Instantiate a kernel (does not launch it yet)
   .. code-block:: c++
 
      Kernel kernel{argumentsForConstructor};
-     auto taskRunKernel = createTaskKernel<Acc>(workDiv, kernel, parameters);
 
 acc parameter of the kernel is provided automatically, does not need to be specified here
 
+Get information about the kernel from the device (size, maxThreadsPerBlock, sharedMemSize, registers, etc.)
+  .. code-block:: c++
+
+     auto kernelFunctionAttributes = alpaka::getFunctionAttributes<Acc>(devAcc, kernel, parameters...);
+
+
 Put the kernel for execution
   .. code-block:: c++
 
-     enqueue(queue, taskRunKernel);
+     exec(queue, workDiv, kernel, parameters...);
 
 Kernel Implementation
 ---------------------
@@ -231,7 +262,7 @@ Access multi-dimensional indices and extents of blocks, threads, and elements
      // Origin: Grid, Block, Thread
      // Unit: Blocks, Threads, Elems
 
-Access components of and destructuremulti-dimensional indices and extents
+Access components of and destructure multi-dimensional indices and extents
   .. code-block:: c++
 
      auto idxX = idx[0];
@@ -240,7 +271,12 @@ Access components of and destructuremulti-dimensional indices and extents
 Linearize multi-dimensional vectors
   .. code-block:: c++
 
-     auto linearIdx = mapIdx<1u>(idx, extent);
+     auto linearIdx = mapIdx<1u>(idxND, extentND);
+
+More generally, index multi-dimensional vectors with a different dimensionality
+  .. code-block:: c++
+
+     auto idxND = alpaka::mapIdx<N>(idxMD, extentMD);
 
 .. raw:: pdf
 
diff --git a/alpaka/docs/source/basic/install.rst b/alpaka/docs/source/basic/install.rst
index 1f716faf..bfdb241c 100644
--- a/alpaka/docs/source/basic/install.rst
+++ b/alpaka/docs/source/basic/install.rst
@@ -3,18 +3,61 @@
 Installation
 ============
 
+**Installing dependencies**
+
+alpaka requires **Boost** and a modern C++ compiler (g++, clang++, Visual C++, …). In order to install **Boost**:
+
+On Linux:
+
 .. code-block::
 
-  # Clone alpaka from github.com
-  git clone --branch 0.9.0 https://github.com/alpaka-group/alpaka.git
-  cd alpaka
-  mkdir build && cd build
-  cmake -DCMAKE_INSTALL_PREFIX=/install/ ..
-  cmake --install .
+  # RPM
+  sudo dnf install boost-devel
+  # DEB
+  sudo apt install libboost-all-dev
+
+On macOS:
+
+.. code-block::
+
+  # Using Homebrew, https://brew.sh
+  brew install boost
+  # Using MacPorts, https://macports.org
+  sudo port install boost
+
+On Windows:
+
+.. code-block::
+
+  # Using vcpkg, https://github.com/microsoft/vcpkg
+  vcpkg install boost
+
+**CMake** is the preferred system for configuration the build tree, building and installing. In order to install **CMake**:
+
+On Linux:
+
+.. code-block::
+
+  # RPM
+  sudo dnf install cmake
+  # DEB
+  sudo apt install cmake
+
+On macOS or Windows:
+
+Download the installer from https://cmake.org/download/
+
+**Dependencies to use specific backends**: Depending on your target platform you may need additional packages to compile and run alpaka.
+
+- NVIDIA GPUs: CUDA Toolkit (https://developer.nvidia.com/cuda-toolkit)
+- AMD GPUs: ROCm / HIP (https://rocmdocs.amd.com/en/latest/index.html)
+- Intel GPUs: OneAPI Toolkit (https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html#gs.9x3lnh)
 
 Tests and Examples
 ++++++++++++++++++
 
+The examples and tests can be compiled without installing alpaka. They will use alpaka headers from the source directory.
+
 **Build and run examples:**
 
 .. code-block::
@@ -56,3 +99,17 @@ In the overview of :doc:`cmake arguments </advanced/cmake>` you will find all CM
 .. hint::
 
   When the test or examples are activated, the alpaka build system automatically activates the ``serial backend``, as it is needed for many tests. Therefore, the tests are run with the ``serial backend`` by default. If you want to test another backend, you have to activate it at CMake configuration time, for example the ``HIP`` backend: ``cmake .. -DBUILD_TESTING=ON -Dalpaka_ACC_GPU_HIP_ENABLE=ON``. Some alpaka tests use a selector algorithm to choose a specific accelerator for the test cases. The selector works with accelerator priorities. Therefore, it is recommended to enable only one accelerator for a build to make sure that the right one is used.
+
+
+**Installing alpaka**
+
+If user is going to create her/his own project/example outside the source tree alpaka should be installed. Since alpaka is a header only library compilation is not needed before installation.
+
+.. code-block::
+
+  # Clone alpaka from github.com
+  git clone --branch 1.1.0 https://github.com/alpaka-group/alpaka.git
+  cd alpaka
+  mkdir build && cd build
+  cmake -DCMAKE_INSTALL_PREFIX=/install/ ..
+  cmake --install .
diff --git a/alpaka/docs/source/basic/library.rst b/alpaka/docs/source/basic/library.rst
index d9083f02..ab73d716 100644
--- a/alpaka/docs/source/basic/library.rst
+++ b/alpaka/docs/source/basic/library.rst
@@ -124,7 +124,7 @@ Kernels can also be defined via lambda expressions.
 
       int main() {
           // ...
-	  using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+	  using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
 	  auto kernel = [] ALPAKA_FN_ACC (Acc const & acc /* , ... */) -> void {
 	      // ...
@@ -211,8 +211,11 @@ Memory Management
 
 The memory allocation function of the *alpaka* library (``alpaka::allocBuf<TElem>(device, extents)``) is uniform for all devices, even for the host device.
 It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks.
-Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on.
-This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (``alpaka::memcpy(bufDevA, bufDevB, copyExtents``).
+Additionally, the memory buffer objects know their extents, their pitches as well as the device they reside on.
+Due to padding, the allocated number of bytes may be more than the required storage; the pitch value gives the correct stride for each dimension for row-major access.
+This allows buffers that possibly reside on different devices with different pitches to be copied by providing the buffer objects as well as the extents of the region to copy (``alpaka::memcpy(queue, bufDevA, bufDevB, copyExtents``).
+
+If the data is already in a contiguous STL container on the host; the container can be converted to a View to be used in ``alpaka::memcpy`` function. The data structure ``alpaka::View`` knows the the extent and the device of the data; therefore can be used in memcpy. (``alpaka::memcpy(queue, bufDevA, viewDevB, copyExtents``).
 
 Kernel Execution
 ````````````````
diff --git a/alpaka/docs/source/conf.py b/alpaka/docs/source/conf.py
index 56623cbd..3b43daca 100644
--- a/alpaka/docs/source/conf.py
+++ b/alpaka/docs/source/conf.py
@@ -7,20 +7,20 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'alpaka'
-copyright = 'Documentation under CC-BY 4.0, Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann'
-author = 'Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann'
+project = "alpaka"
+copyright = "Documentation under CC-BY 4.0, Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann"
+author = "Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann"
 # The short X.Y version.
-version = u'1.0.0'
+version = "1.0.0"
 # The full version, including alpha/beta/rc tags.
-release = u'1.0.0-rc1'
+release = "1.0.0-rc1"
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # -- General configuration ---------------------------------------------------
 
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+on_rtd = os.environ.get("READTHEDOCS", None) == "True"
 
 show_authors = True
 
@@ -28,28 +28,28 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.mathjax',
-#    'sphinx.ext.napoleon',
-    'breathe',
-    'sphinxcontrib.programoutput',
-#    'matplotlib.sphinxext.plot_directive'
-    'sphinx.ext.autosectionlabel',
+    "sphinx.ext.mathjax",
+    #    'sphinx.ext.napoleon',
+    "breathe",
+    "sphinxcontrib.programoutput",
+    #    'matplotlib.sphinxext.plot_directive'
+    "sphinx.ext.autosectionlabel",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = ["Thumbs.db", ".DS_Store"]
 
-source_suffix = ['.rst']
-master_doc = 'index'
+source_suffix = [".rst"]
+master_doc = "index"
 language = "en"
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx' #'default'
+pygments_style = "sphinx"  #'default'
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
@@ -59,24 +59,22 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # modifies the HTML Sphinx Doc layout
 html_css_files = ["custom.css"]
 
 html_logo = "../logo/alpaka.svg"
-html_theme_options = {
-    "logo_only"  : True
-}
+html_theme_options = {"logo_only": True}
 
 # -- Options for HTMLHelp output ---------------------------------------------
 
-htmlhelp_basename = 'alpakadoc'
+htmlhelp_basename = "alpakadoc"
 
 # -- Options for LaTeX output ------------------------------------------------
 
@@ -84,17 +82,14 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-    'papersize': 'a4paper',
-
+    "papersize": "a4paper",
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-    'preamble': r'\setcounter{tocdepth}{2}',
-
+    "preamble": r"\setcounter{tocdepth}{2}",
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -105,18 +100,14 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'alpaka-doc.tex', u'alpaka Documentation',
-     u'The alpaka Community', 'manual'),
+    (master_doc, "alpaka-doc.tex", "alpaka Documentation", "The alpaka Community", "manual"),
 ]
 
 # -- Options for manual page output ------------------------------------------
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'alpaka', u'alpaka Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "alpaka", "alpaka Documentation", [author], 1)]
 
 # -- Options for Texinfo output ----------------------------------------------
 
@@ -124,51 +115,56 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'alpaka', u'alpaka Documentation',
-     author, 'alpaka', 'Abstraction Library for Parallel Kernel Acceleration',
-     """
+    (
+        master_doc,
+        "alpaka",
+        "alpaka Documentation",
+        author,
+        "alpaka",
+        "Abstraction Library for Parallel Kernel Acceleration",
+        """
      The alpaka library is a header-only C++17 abstraction library for
      accelerator development. Its aim is to provide performance portability
      across accelerators through the abstraction (not hiding!) of the underlying
      levels of parallelism.
-     """),
+     """,
+    ),
 ]
 
 # -- Options for Epub output -------------------------------------------------
 
 # A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
 
 
 # -- Extension configuration -------------------------------------------------
 
-breathe_projects = { "alpaka": "../doxygen/xml" }
+breathe_projects = {"alpaka": "../doxygen/xml"}
 breathe_default_project = "alpaka"
 
-breathe_domain_by_extension = {
-    "cpp":   "cpp",
-    "h":     "cpp",
-    "hpp":   "cpp",
-    "tpp":   "cpp"
-}
+breathe_domain_by_extension = {"cpp": "cpp", "h": "cpp", "hpp": "cpp", "tpp": "cpp"}
 
 # define alpaka attributes
 # breath has problems to parse C++ attributes
-cpp_id_attributes = ["ALPAKA_FN_ACC",
-                     "ALPAKA_FN_HOST",
-                     "ALPAKA_FN_HOST_ACC",
-                     "ALPAKA_FN_INLINE",
-                     "ALPAKA_NO_HOST_ACC_WARNING",
-                     "ALPAKA_STATIC_ACC_MEM_CONSTANT",
-                     "ALPAKA_STATIC_ACC_MEM_GLOBAL",
-                     ]
+cpp_id_attributes = [
+    "ALPAKA_FN_ACC",
+    "ALPAKA_FN_HOST",
+    "ALPAKA_FN_HOST_ACC",
+    "ALPAKA_FN_INLINE",
+    "ALPAKA_NO_HOST_ACC_WARNING",
+    "ALPAKA_STATIC_ACC_MEM_CONSTANT",
+    "ALPAKA_STATIC_ACC_MEM_GLOBAL",
+]
 
 # -- processing --
 
 if on_rtd:
-    subprocess.call('cd ..; doxygen', shell=True)
-    subprocess.call('cd ../cheatsheet; rst2pdf -s cheatsheet.style ../source/basic/cheatsheet.rst -o cheatsheet.pdf', shell=True)
+    subprocess.call("cd ..; doxygen", shell=True)
+    subprocess.call(
+        "cd ../cheatsheet; rst2pdf -s cheatsheet.style ../source/basic/cheatsheet.rst -o cheatsheet.pdf", shell=True
+    )
 else:
     import sphinx_rtd_theme
+
     html_theme = "sphinx_rtd_theme"
     html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
diff --git a/alpaka/docs/source/dev/backends.rst b/alpaka/docs/source/dev/backends.rst
index c9a6bccc..386cb3b0 100644
--- a/alpaka/docs/source/dev/backends.rst
+++ b/alpaka/docs/source/dev/backends.rst
@@ -195,7 +195,7 @@ Depending on the cmake argument ``ALPAKA_ACC_GPU_CUDA_ONLY_MODE`` the function a
     +----------+-------------------------------------+
     | CUDA     | alpaka                              |
     +==========+=====================================+
-    | ``dim3`` | ``alpaka::Vec< TDim, TVal >``  |
+    | ``dim3`` | ``alpaka::Vec< TDim, TVal >``       |
     +----------+-------------------------------------+
 
 
@@ -242,7 +242,7 @@ The following tables list the functions available in the `CUDA Runtime API <http
     +---------------------------------+-----------------------------------------------------------------------+
     | cudaGetDevice                   | n/a (no current device)                                               |
     +---------------------------------+-----------------------------------------------------------------------+
-    | cudaGetDeviceCount              | std::sizet alpaka::getDevCount< TPlatform >()                         |
+    | cudaGetDeviceCount              | std::size_t alpaka::getDevCount< TPlatform >()                         |
     +---------------------------------+-----------------------------------------------------------------------+
     | cudaGetDeviceFlags              | --                                                                    |
     +---------------------------------+-----------------------------------------------------------------------+
@@ -353,7 +353,7 @@ The following tables list the functions available in the `CUDA Runtime API <http
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaFreeAsync              | n/a (automatic memory management with reference counted memory handles)                    |
     +----------------------------+--------------------------------------------------------------------------------------------+
-    | cudaFreeHost               | n/a (automatic memory management with reference counted memory handles)                                                                                       |
+    | cudaFreeHost               | n/a (automatic memory management with reference counted memory handles)                    |
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaFreeMipmappedArray     | --                                                                                         |
     +----------------------------+--------------------------------------------------------------------------------------------+
@@ -363,7 +363,7 @@ The following tables list the functions available in the `CUDA Runtime API <http
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaGetSymbolSize          | --                                                                                         |
     +----------------------------+--------------------------------------------------------------------------------------------+
-    | cudaHostAlloc              | alpaka::allocMappedBuf<TPlatform, TElement>(host, extents) 1D, 2D, 3D supported!           |
+    | cudaHostAlloc              | alpaka::allocMappedBuf<TElement, TIdx>(host, platform, extents) 1D, 2D, 3D supported! [1]  |
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaHostGetDevicePointer   | --                                                                                         |
     +----------------------------+--------------------------------------------------------------------------------------------+
@@ -383,7 +383,7 @@ The following tables list the functions available in the `CUDA Runtime API <http
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaMallocAsync            | alpaka::allocAsyncBuf<TElement>(queue, extents1D)                                          |
     +----------------------------+--------------------------------------------------------------------------------------------+
-    | cudaMallocHost             | alpaka::allocMappedBuf<TPlatform, TElement>(host, extents) 1D, 2D, 3D supported!           |
+    | cudaMallocHost             | alpaka::allocMappedBuf<TElement, TIdx>(host, platform, extents) 1D, 2D, 3D supported! [1]  |
     +----------------------------+--------------------------------------------------------------------------------------------+
     | cudaMallocManaged          | --                                                                                         |
     +----------------------------+--------------------------------------------------------------------------------------------+
@@ -473,6 +473,7 @@ The following tables list the functions available in the `CUDA Runtime API <http
     | cudaMemcpyDeviceToHost     | n/a (direction of copy is determined automatically)                                        |
     +----------------------------+--------------------------------------------------------------------------------------------+
 
+[1] Not every platform supports mapped buffers, so `alpaka::allocMappedBufIfSupported<TElement, TIdx>(host, platform, extents)` should be used instead to support these platforms as well.
 
 *Execution Control*
 
diff --git a/alpaka/docs/source/dev/details.rst b/alpaka/docs/source/dev/details.rst
index d81e4fb3..d9ecb748 100644
--- a/alpaka/docs/source/dev/details.rst
+++ b/alpaka/docs/source/dev/details.rst
@@ -180,7 +180,7 @@ A type can model the queue concept completely by defining specializations for ``
 This functionality can be accessed by the corresponding ``alpaka::enqueue`` and ``alpaka::empty`` template functions.
 
 Currently there is no native language support for describing and checking concepts in C++ at compile time.
-A study group (SG8) is working on the ISO `specification for conecpts <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf>`_ and compiler forks implementing them do exist.
+A study group (SG8) is working on the ISO `specification for concepts <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf>`_ and compiler forks implementing them do exist.
 For usage in current C++ there are libraries like `Boost.ConceptCheck <https://www.boost.org/doc/libs/1_58_0/libs/concept_check/concept_check.htm>`_ which try to emulate requirement checking of concept types.
 Those libraries often exploit the preprocessor and require non-trivial changes to the function declaration syntax.
 Therefore the *alpaka* library does not currently make use of *Boost.ConceptCheck*.
@@ -239,7 +239,7 @@ However, due to SFINAE, this would not result in a compiler error but rather onl
 The ``std::enable_if`` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false.
 Therefore it can be used to disable specializations depending on arbitrary boolean conditions.
 It is utilized in the case where the ``TaskId`` member is unequal one or the ``TQueue`` does not inherit from ``UserQueue``.
-In this cirumstances, the condition itself results in valid code but because it evaluates to false, the ``std::enable_if`` specialization results in invalid code and the whole ``Enqueue`` template specialization gets omitted.
+In this circumstances, the condition itself results in valid code but because it evaluates to false, the ``std::enable_if`` specialization results in invalid code and the whole ``Enqueue`` template specialization gets omitted.
 
 Argument dependent lookup for math functions
 --------------------------------------------
diff --git a/alpaka/docs/source/dev/style.rst b/alpaka/docs/source/dev/style.rst
index c95bfc5b..d92c53b7 100644
--- a/alpaka/docs/source/dev/style.rst
+++ b/alpaka/docs/source/dev/style.rst
@@ -3,148 +3,123 @@
 Coding Guidelines
 ==================
 
-.. attention::
-   The Coding Guidelines are currently revised
+Pre-commit
+----------
 
-General
--------
+This project is set up for use with `pre-commit <https://pre-commit.com>`_. Using it will make your code conform with most
+of our (easily automatable) code style guidelines automatically. Pre-commit is a tool that manages
+`git hooks <https://git-scm.com/docs/githooks>`_ conveniently for you.
+In very short (for anything further see `pre-commit <https://pre-commit.com>`_), after running the following in your
+working clone of alpaka
 
-* Use the ``.clang-format`` file supplied in alpaka's top-level directory to format your code. This will handle indentation,
-whitespace and braces automatically. Usage:
+.. code-block:: bash
+
+  # if not yet done, install the pre-commit executable following https://pre-commit.com
+  cd /path/to/alpaka-working-clone
+  pre-commit install
+
+``git`` will run a number of checks prior to every commit and push and will refuse to perform the
+pertinent action if they fail. Most of them (like e.g. the formatter) will have automatically altered your working tree
+with the necessary changes such that
 
 .. code-block:: bash
 
-  clang-format-16 -i <sourcefile>
+  git add -u
 
-* If you want to format the entire code base execute the following command from alpaka's top-level directory:
+will make the next commit pass. Although discouraged, in urgent cases it might be needed to be able to commit even if
+the checks fail. For such cases, you can either use
 
 .. code-block:: bash
 
-  find example include test -name '*.hpp' -o -name '*.cpp' | xargs clang-format-16 -i
+  git commit --no-verify [...]
 
-Windows users should use `Visual Studio's native clang-format integration
-<https://devblogs.microsoft.com/cppblog/clangformat-support-in-visual-studio-2017-15-7-preview-1/>`.
+to completely skip all checks or use the more fine-grained control described `here <https://pre-commit.com/#temporarily-disabling-hooks>`_.
 
-Naming
-------
+You can use
 
-* Types are always in PascalCase (KernelExecCuda, BufT, ...) and singular.
-* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else.
-* Namespaces are always in lowercase and singular is preferred.
-* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). This makes names more easily readable.
+.. code-block:: bash
+
+   pre-commit run --all-files
 
+to run all the hooks on all files.
 
-Types
------
+Formatting
+----------
 
-* Always use integral types with known width (``int32_t``, ``uint64_t``, ...).
-  Never use ``int``, ``unsigned long``, etc.
+Use the ``.clang-format`` file supplied in alpaka's top-level directory to format your code.
+This will handle indentation, whitespace and braces automatically.
+Checkout ``CONTRIBUTING.md`` on how to run it.
 
+Naming
+------
+
+* Types are always in PascalCase (KernelExecCuda, BufT, ...) and singular.
+* Variables are always in camelCase (memBufHost, ...) and singular by default. Use plural for collections.
+* Namespaces are always in lowercase and singular is preferred.
+* Avoid consecutive upper case letters. E.g.: AccOpenMp instead of AccOpenMP, or HtmlRenderer instead of HTMLRenderer.
+  This makes names more easily readable.
 
 Type Qualifiers
 ---------------
 
-The order of  type qualifiers should be:
+The order of type qualifiers should be:
 ``Type const * const`` for a const pointer to a const Type.
 ``Type const &`` for a reference to a const Type.
 
 The reason is that types can be read from right to left correctly without jumping back and forth.
 ``const Type * const`` and ``const Type &`` would require jumping in either way to read them correctly.
+clang-format should handle this automatically in most cases.
 
 
 Variables
 ---------
 
-* Variables should always be initialized on construction because this can produce hard to debug errors.
-  This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style.
-* Variables should (nearly) always be ``const`` to make the code more easy to understand.
-  This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM.
-  This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers.
-* Variable definitions should be differentiated from assignments by using either ``(...)`` or ``{...}`` but never ``=`` for definitions.
-  Use ``uint32_t const iUsageOfThisVariable(42);`` instead of ``uint32_t const iUsageOfThisVariable = 42;``
-
+* Variables should be initialized at definition to avoid hard to debug errors, even in performance critical code.
+  If you suspect a slowdown, measure first.
+* Variables should be ``const`` to make the code more easy to understand.
+  This is equivalent to functional programming and the SSA (static single assignment) style.
+* Prefer direct-initialization using braces for variable definitions, e.g. ``T t{...}``,
+  over copy-initialization, e.g. ``T t = {...}``.
+  Avoid direct-initialization using parenthesis, e.g. ``T t(...)``.
 
 Comments
 --------
 
-* Always use C++-Style comments ``//``
-* For types use
-  ``//#############################################################################``
-  to start the comment block.
-* For functions use
-  ``//-----------------------------------------------------------------------------``
-  to start the comment block.
-
+* Always use C++-style comments ``//``
 
 Functions
 ---------
 
-* Always use the trailing return type syntax with the return type on a new line even if the return type is void:
+* Always use the trailing return type syntax, even if the return type is ``void``:
 
 .. code-block::
 
-   auto func()
-   -> bool
+   auto func() -> bool
 
-* This makes it easier to see the return type because it is on its own line.
 * This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions.
-* Each function parameter is on a new indented line:
-
-.. code-block::
-
-   auto func(
-       float f1,
-       float f2)
-   -> bool
-   {
-       return true
-   }
-
-.. code-block::
-
-   func(
-       1.0f,
-       2.0f);
-
-* Makes it easier to see how many parameters there are and which position they have.
-
 
 Templates
 ---------
 
-* Template parameters are prefixed with ``T`` to differentiate them from class or function local typedefs.
-* Each template parameter is on a new indented line:
+* Template parameters, which are not a single letter, are prefixed with ``T`` to differentiate them from class or function local aliases.
 
 .. code-block:: c++
 
-   template<
-       typename TParam,
-       typename TArgs...>
-   auto func()
-   -> bool
+   template<int I, typename TParam, typename TArgs...>
+   auto func() -> bool
 
-* Makes it easier to see how many template parameters there are and which position they have.
-* Always use ``typename`` for template parameters. There is NO difference to class and typename matches the intent better.
+* Always use ``typename`` instead of ``class`` for template parameters.
+  There is NO semantic difference between them, but ``typename`` matches the intent better.
 
 
 Traits
 ------
 
-* Trait classes always have one more template parameter (with default parameter) then is required for enabling SFINAE in the specialization:
+* Trait classes must have one additional template parameter (defaulted to ``void``) then required to enable SFINAE in specializations:
 
 .. code-block::
 
-   template<
-       typename T,
-       typename TSfinae = void>
+   template<typename T, typename TSfinae = void>
    struct GetOffsets;
 
-* Template trait aliases always end with a ``T`` e.g. ``BufT`` while the corresponding trait ends with ``Type`` e.g. ``BufType``
 * Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: ``sin(){...}`` and ``Sin{sin(){...}};``
-
-Includes
---------
-
-* The order of includes is from the most specialized header to the most general one.
-  This order helps to find missing includes in more specialized headers because the general ones are always included afterwards.
-* A comment with the types or functions included by a include file make it easier to find out why a special header is included.
diff --git a/alpaka/docs/source/dev/test.rst b/alpaka/docs/source/dev/test.rst
index c793fe61..ca5ee28f 100644
--- a/alpaka/docs/source/dev/test.rst
+++ b/alpaka/docs/source/dev/test.rst
@@ -3,7 +3,7 @@
 Writing a unit test
 ===================
 
-After implementing a new functionality in Alpaka, it is recommended to test it. Indeed, having no compile time errors
+After implementing a new functionality in alpaka, it is recommended to test it. Indeed, having no compile time errors
 when building alpaka with any backend doesn't mean that such functionality is well implemented and behaves as expected.
 
 Unit tests are written and integrated with `Catch2 <https://github.com/catchorg/Catch2>`_ and they are standalone executables located in the ``test/unit`` and
@@ -46,7 +46,7 @@ Test cases using an alpaka accelerator are then typically introduced with the ``
 It takes three arguments:
 * a free form test name (must be unique)
 * a tag
-* the accelerator(s) that must run the test case(s) (i.e. ``alpaka::test::TestAccs`` targets all the accelereators selected
+* the accelerator(s) that must run the test case(s) (i.e. ``alpaka::test::TestAccs`` targets all the accelerators selected
 by the ``TestAccs`` header).
 
 Some aliases might be useful:
diff --git a/alpaka/example/CMakeLists.txt b/alpaka/example/CMakeLists.txt
index b74c45b8..4d3ae6a1 100644
--- a/alpaka/example/CMakeLists.txt
+++ b/alpaka/example/CMakeLists.txt
@@ -17,12 +17,17 @@ project("alpakaExamples" LANGUAGES CXX)
 
 add_subdirectory("bufferCopy/")
 add_subdirectory("complex/")
+add_subdirectory("convolution1D/")
+add_subdirectory("convolution2D/")
+add_subdirectory("conv2DWithMdspan/")
 add_subdirectory("counterBasedRng/")
 add_subdirectory("heatEquation/")
+add_subdirectory("heatEquation2D/")
 add_subdirectory("helloWorld/")
 add_subdirectory("helloWorldLambda/")
 add_subdirectory("kernelSpecialization/")
 add_subdirectory("ls/")
+add_subdirectory("matrixMulWithMdspan/")
 add_subdirectory("monteCarloIntegration/")
 add_subdirectory("openMPSchedule/")
 add_subdirectory("parallelLoopPatterns/")
@@ -31,4 +36,3 @@ add_subdirectory("randomCells2D/")
 add_subdirectory("reduce/")
 add_subdirectory("tagSpecialization/")
 add_subdirectory("vectorAdd/")
-add_subdirectory("babelstream/")
diff --git a/alpaka/example/babelstream/CMakeLists.txt b/alpaka/example/babelstream/CMakeLists.txt
deleted file mode 100644
index b4cea499..00000000
--- a/alpaka/example/babelstream/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-#
-# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
-# SPDX-License-Identifier: ISC
-#
-
-cmake_minimum_required(VERSION 3.22)
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
-project(babelstream LANGUAGES CXX)
-
-if(NOT TARGET alpaka::alpaka)
-    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
-    if(alpaka_USE_SOURCE_TREE)
-        # Don't build the examples recursively
-        set(alpaka_BUILD_EXAMPLES OFF)
-        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
-    else()
-        find_package(alpaka REQUIRED)
-    endif()
-endif()
-
-alpaka_add_executable(${PROJECT_NAME} src/main.cpp src/Stream.h src/AlpakaStream.cpp src/AlpakaStream.h)
-target_compile_definitions(${PROJECT_NAME} PUBLIC ALPAKA)
-target_link_libraries(${PROJECT_NAME} PUBLIC alpaka::alpaka)
-set_target_properties(${PROJECT_NAME} PROPERTIES FOLDER example)
diff --git a/alpaka/example/babelstream/src/AlpakaStream.cpp b/alpaka/example/babelstream/src/AlpakaStream.cpp
deleted file mode 100644
index 7f618549..00000000
--- a/alpaka/example/babelstream/src/AlpakaStream.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-//
-// Cupla version created by Jeff Young in 2021
-// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022
-
-#include "AlpakaStream.h"
-
-#include <numeric>
-
-namespace
-{
-    constexpr auto blockSize = 1024;
-    constexpr auto dotBlockSize = 256;
-} // namespace
-
-template<typename T>
-AlpakaStream<T>::AlpakaStream(Idx arraySize, Idx deviceIndex)
-    : arraySize(arraySize)
-    , devHost(alpaka::getDevByIdx(platformHost, 0))
-    , devAcc(alpaka::getDevByIdx(platformAcc, deviceIndex))
-    , sums(alpaka::allocBuf<T, Idx>(devHost, dotBlockSize))
-    , d_a(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
-    , d_b(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
-    , d_c(alpaka::allocBuf<T, Idx>(devAcc, arraySize))
-    , d_sum(alpaka::allocBuf<T, Idx>(devAcc, dotBlockSize))
-    , queue(devAcc)
-{
-    if(arraySize % blockSize != 0)
-        throw std::runtime_error("Array size must be a multiple of " + std::to_string(blockSize));
-    std::cout << "Using alpaka device " << alpaka::getName(devAcc) << std::endl;
-}
-
-struct InitKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB, T initC) const
-    {
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        a[i] = initA;
-        b[i] = initB;
-        c[i] = initC;
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(
-        queue,
-        workdiv,
-        InitKernel{},
-        alpaka::getPtrNative(d_a),
-        alpaka::getPtrNative(d_b),
-        alpaka::getPtrNative(d_c),
-        initA,
-        initB,
-        initC);
-    alpaka::wait(queue);
-}
-
-template<typename T>
-void AlpakaStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
-{
-    alpaka::memcpy(queue, alpaka::createView(devHost, a), d_a);
-    alpaka::memcpy(queue, alpaka::createView(devHost, b), d_b);
-    alpaka::memcpy(queue, alpaka::createView(devHost, c), d_c);
-}
-
-struct CopyKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
-    {
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i];
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::copy()
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, alpaka::getPtrNative(d_a), alpaka::getPtrNative(d_c));
-    alpaka::wait(queue);
-}
-
-struct MulKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* b, T const* c) const
-    {
-        const T scalar = startScalar;
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[i] = scalar * c[i];
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::mul()
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(queue, workdiv, MulKernel{}, alpaka::getPtrNative(d_b), alpaka::getPtrNative(d_c));
-    alpaka::wait(queue);
-}
-
-struct AddKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
-    {
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i] + b[i];
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::add()
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(
-        queue,
-        workdiv,
-        AddKernel{},
-        alpaka::getPtrNative(d_a),
-        alpaka::getPtrNative(d_b),
-        alpaka::getPtrNative(d_c));
-    alpaka::wait(queue);
-}
-
-struct TriadKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
-    {
-        const T scalar = startScalar;
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        a[i] = b[i] + scalar * c[i];
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::triad()
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(
-        queue,
-        workdiv,
-        TriadKernel{},
-        alpaka::getPtrNative(d_a),
-        alpaka::getPtrNative(d_b),
-        alpaka::getPtrNative(d_c));
-    alpaka::wait(queue);
-}
-
-struct NstreamKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
-    {
-        const T scalar = startScalar;
-        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        a[i] += b[i] + scalar * c[i];
-    }
-};
-
-template<typename T>
-void AlpakaStream<T>::nstream()
-{
-    auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
-    alpaka::exec<Acc>(
-        queue,
-        workdiv,
-        NstreamKernel{},
-        alpaka::getPtrNative(d_a),
-        alpaka::getPtrNative(d_b),
-        alpaka::getPtrNative(d_c));
-    alpaka::wait(queue);
-}
-
-struct DotKernel
-{
-    template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, int arraySize) const
-    {
-        // TODO(Jeff Young) - test if sharedMem bug is affecting performance here
-        auto& tbSum = alpaka::declareSharedVar<T[blockSize], __COUNTER__>(acc);
-
-        auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const [local_i] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
-        auto const [totalThreads] = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        T threadSum = 0;
-        for(; i < arraySize; i += totalThreads) // NOLINT(bugprone-infinite-loop)
-            threadSum += a[i] * b[i];
-        tbSum[local_i] = threadSum;
-
-        auto const [blockDim] = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
-        for(int offset = blockDim / 2; offset > 0; offset /= 2)
-        {
-            alpaka::syncBlockThreads(acc);
-            if(local_i < offset)
-                tbSum[local_i] += tbSum[local_i + offset];
-        }
-
-        auto const [blockIdx] = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
-        if(local_i == 0)
-            sum[blockIdx] = tbSum[local_i];
-    }
-};
-
-template<typename T>
-auto AlpakaStream<T>::dot() -> T
-{
-    auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1};
-    // auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize);
-    alpaka::exec<Acc>(
-        queue,
-        workdiv,
-        DotKernel{},
-        alpaka::getPtrNative(d_a),
-        alpaka::getPtrNative(d_b),
-        alpaka::getPtrNative(d_sum),
-        arraySize);
-    alpaka::wait(queue);
-
-    alpaka::memcpy(queue, sums, d_sum);
-    T const* sumPtr = alpaka::getPtrNative(sums);
-    // TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline
-    return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0});
-}
-
-void listDevices()
-{
-    auto const platform = alpaka::Platform<Acc>{};
-    auto const count = alpaka::getDevCount(platform);
-    std::cout << "Devices:" << std::endl;
-    for(int i = 0; i < count; i++)
-        std::cout << i << ": " << getDeviceName(i) << std::endl;
-}
-
-auto getDeviceName(int deviceIndex) -> std::string
-{
-    auto const platform = alpaka::Platform<Acc>{};
-    return alpaka::getName(alpaka::getDevByIdx(platform, deviceIndex));
-}
-
-auto getDeviceDriver([[maybe_unused]] int device) -> std::string
-{
-    return "Not supported";
-}
-
-template class AlpakaStream<float>;
-template class AlpakaStream<double>;
diff --git a/alpaka/example/babelstream/src/AlpakaStream.h b/alpaka/example/babelstream/src/AlpakaStream.h
deleted file mode 100644
index ba556b02..00000000
--- a/alpaka/example/babelstream/src/AlpakaStream.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-//
-// Cupla version created by Jeff Young in 2021
-// Ported from cupla to alpaka by Bernhard Manfred Gruber in 2022
-
-#pragma once
-
-#include "Stream.h"
-
-#include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
-
-#include <vector>
-
-inline constexpr auto IMPLEMENTATION_STRING = "alpaka";
-
-using Dim = alpaka::DimInt<1>;
-using Idx = int;
-using Vec = alpaka::Vec<Dim, Idx>;
-using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
-
-template<typename T>
-struct AlpakaStream : Stream<T>
-{
-    AlpakaStream(Idx arraySize, Idx deviceIndex);
-
-    void copy() override;
-    void add() override;
-    void mul() override;
-    void triad() override;
-    void nstream() override;
-    auto dot() -> T override;
-
-    void init_arrays(T initA, T initB, T initC) override;
-    void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
-
-    using PlatformHost = alpaka::PlatformCpu;
-    using DevHost = alpaka::Dev<PlatformHost>;
-    using PlatformAcc = alpaka::Platform<Acc>;
-    using DevAcc = alpaka::Dev<Acc>;
-    using BufHost = alpaka::Buf<alpaka::DevCpu, T, Dim, Idx>;
-    using BufAcc = alpaka::Buf<Acc, T, Dim, Idx>;
-    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
-
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-
-private:
-    Idx arraySize;
-    PlatformHost platformHost;
-    DevHost devHost;
-    PlatformAcc platformAcc;
-    DevAcc devAcc;
-    BufHost sums;
-    BufAcc d_a;
-    BufAcc d_b;
-    BufAcc d_c;
-    BufAcc d_sum;
-    Queue queue;
-};
diff --git a/alpaka/example/babelstream/src/README.md b/alpaka/example/babelstream/src/README.md
deleted file mode 100644
index 781cdf31..00000000
--- a/alpaka/example/babelstream/src/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-This is a port of [BabelStream](https://github.com/UoB-HPC/BabelStream) to alpaka.
-This work is based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young.
-The benchmark driver (`main.cpp` and `Stream.h`) is taken from BabelStream.
-No other backends are available, only alpaka.
-Thus, there is no need to select a backend, just run the executable.
-Please refer to the BabelStream documentation of more information on how to run the benchmark.
diff --git a/alpaka/example/babelstream/src/Stream.h b/alpaka/example/babelstream/src/Stream.h
deleted file mode 100644
index d4548428..00000000
--- a/alpaka/example/babelstream/src/Stream.h
+++ /dev/null
@@ -1,48 +0,0 @@
-
-// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-// NOLINTBEGIN
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-// Array values
-#define startA (0.1)
-#define startB (0.2)
-#define startC (0.0)
-#define startScalar (0.4)
-
-template<class T>
-class Stream
-{
-public:
-    virtual ~Stream()
-    {
-    }
-
-    // Kernels
-    // These must be blocking calls
-    virtual void copy() = 0;
-    virtual void mul() = 0;
-    virtual void add() = 0;
-    virtual void triad() = 0;
-    virtual void nstream() = 0;
-    virtual T dot() = 0;
-
-    // Copy memory between host and device
-    virtual void init_arrays(T initA, T initB, T initC) = 0;
-    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
-};
-
-// Implementation specific device functions
-void listDevices(void);
-std::string getDeviceName(int const);
-std::string getDeviceDriver(int const);
-
-// NOLINTEND
diff --git a/alpaka/example/babelstream/src/main.cpp b/alpaka/example/babelstream/src/main.cpp
deleted file mode 100644
index acef1c33..00000000
--- a/alpaka/example/babelstream/src/main.cpp
+++ /dev/null
@@ -1,588 +0,0 @@
-
-// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
-// University of Bristol HPC
-//
-// For full license terms please see the LICENSE file distributed with this
-// source code
-
-// NOLINTBEGIN
-
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstring>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <numeric>
-#include <vector>
-
-#define VERSION_STRING "4.0"
-
-#include "Stream.h"
-
-#if defined(CUDA)
-#    include "CUDAStream.h"
-#elif defined(STD_DATA)
-#    include "STDDataStream.h"
-#elif defined(STD_INDICES)
-#    include "STDIndicesStream.h"
-#elif defined(STD_RANGES)
-#    include "STDRangesStream.hpp"
-#elif defined(TBB)
-#    include "TBBStream.hpp"
-#elif defined(THRUST)
-#    include "ThrustStream.h"
-#elif defined(HIP)
-#    include "HIPStream.h"
-#elif defined(HC)
-#    include "HCStream.h"
-#elif defined(OCL)
-#    include "OCLStream.h"
-#elif defined(USE_RAJA)
-#    include "RAJAStream.hpp"
-#elif defined(KOKKOS)
-#    include "KokkosStream.hpp"
-#elif defined(ACC)
-#    include "ACCStream.h"
-#elif defined(SYCL)
-#    include "SYCLStream.h"
-#elif defined(SYCL2020)
-#    include "SYCLStream2020.h"
-#elif defined(OMP)
-#    include "OMPStream.h"
-#elif defined(ALPAKA)
-#    include "AlpakaStream.h"
-#endif
-
-// Default size of 2^25
-int ARRAY_SIZE = 33'554'432;
-unsigned int num_times = 100;
-unsigned int deviceIndex = 0;
-bool use_float = false;
-bool output_as_csv = false;
-bool mibibytes = false;
-std::string csv_separator = ",";
-
-template<typename T>
-void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
-
-template<typename T>
-void run();
-
-// Options for running the benchmark:
-// - All 5 kernels (Copy, Add, Mul, Triad, Dot).
-// - Triad only.
-// - Nstream only.
-enum class Benchmark
-{
-    All,
-    Triad,
-    Nstream
-};
-
-// Selected run options.
-Benchmark selection = Benchmark::All;
-
-void parseArguments(int argc, char* argv[]);
-
-int main(int argc, char* argv[])
-{
-    parseArguments(argc, argv);
-
-    if(!output_as_csv)
-    {
-        std::cout << "BabelStream" << std::endl
-                  << "Version: " << VERSION_STRING << std::endl
-                  << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
-    }
-
-    if(use_float)
-        run<float>();
-    else
-        run<double>();
-}
-
-// Run the 5 main kernels
-template<typename T>
-std::vector<std::vector<double>> run_all(Stream<T>* stream, T& sum)
-{
-    // List of times
-    std::vector<std::vector<double>> timings(5);
-
-    // Declare timers
-    std::chrono::high_resolution_clock::time_point t1, t2;
-
-    // Main loop
-    for(unsigned int k = 0; k < num_times; k++)
-    {
-        // Execute Copy
-        t1 = std::chrono::high_resolution_clock::now();
-        stream->copy();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-
-        // Execute Mul
-        t1 = std::chrono::high_resolution_clock::now();
-        stream->mul();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-
-        // Execute Add
-        t1 = std::chrono::high_resolution_clock::now();
-        stream->add();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-
-        // Execute Triad
-        t1 = std::chrono::high_resolution_clock::now();
-        stream->triad();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-
-        // Execute Dot
-        t1 = std::chrono::high_resolution_clock::now();
-        sum = stream->dot();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-    }
-
-    // Compiler should use a move
-    return timings;
-}
-
-// Run the Triad kernel
-template<typename T>
-std::vector<std::vector<double>> run_triad(Stream<T>* stream)
-{
-    std::vector<std::vector<double>> timings(1);
-
-    // Declare timers
-    std::chrono::high_resolution_clock::time_point t1, t2;
-
-    // Run triad in loop
-    t1 = std::chrono::high_resolution_clock::now();
-    for(unsigned int k = 0; k < num_times; k++)
-    {
-        stream->triad();
-    }
-    t2 = std::chrono::high_resolution_clock::now();
-
-    double runtime = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
-    timings[0].push_back(runtime);
-
-    return timings;
-}
-
-// Run the Nstream kernel
-template<typename T>
-std::vector<std::vector<double>> run_nstream(Stream<T>* stream)
-{
-    std::vector<std::vector<double>> timings(1);
-
-    // Declare timers
-    std::chrono::high_resolution_clock::time_point t1, t2;
-
-    // Run nstream in loop
-    for(int k = 0; k < num_times; k++)
-    {
-        t1 = std::chrono::high_resolution_clock::now();
-        stream->nstream();
-        t2 = std::chrono::high_resolution_clock::now();
-        timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count());
-    }
-
-    return timings;
-}
-
-// Generic run routine
-// Runs the kernel(s) and prints output.
-template<typename T>
-void run()
-{
-    std::streamsize ss = std::cout.precision();
-
-    if(!output_as_csv)
-    {
-        if(selection == Benchmark::All)
-            std::cout << "Running kernels " << num_times << " times" << std::endl;
-        else if(selection == Benchmark::Triad)
-        {
-            std::cout << "Running triad " << num_times << " times" << std::endl;
-            std::cout << "Number of elements: " << ARRAY_SIZE << std::endl;
-        }
-
-
-        if(sizeof(T) == sizeof(float))
-            std::cout << "Precision: float" << std::endl;
-        else
-            std::cout << "Precision: double" << std::endl;
-
-
-        if(mibibytes)
-        {
-            // MiB = 2^20
-            std::cout << std::setprecision(1) << std::fixed
-                      << "Array size: " << ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB"
-                      << " (=" << ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl;
-            std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -20.0) << " MiB"
-                      << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * pow(2.0, -30.0) << " GiB)" << std::endl;
-        }
-        else
-        {
-            // MB = 10^6
-            std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE * sizeof(T) * 1.0E-6
-                      << " MB"
-                      << " (=" << ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl;
-            std::cout << "Total size: " << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-6 << " MB"
-                      << " (=" << 3.0 * ARRAY_SIZE * sizeof(T) * 1.0E-9 << " GB)" << std::endl;
-        }
-        std::cout.precision(ss);
-    }
-
-    Stream<T>* stream;
-
-#if defined(CUDA)
-    // Use the CUDA implementation
-    stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(HIP)
-    // Use the HIP implementation
-    stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(HC)
-    // Use the HC implementation
-    stream = new HCStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(OCL)
-    // Use the OpenCL implementation
-    stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(USE_RAJA)
-    // Use the RAJA implementation
-    stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(KOKKOS)
-    // Use the Kokkos implementation
-    stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_DATA)
-    // Use the C++ STD data-oriented implementation
-    stream = new STDDataStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_INDICES)
-    // Use the C++ STD index-oriented implementation
-    stream = new STDIndicesStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(STD_RANGES)
-    // Use the C++ STD ranges implementation
-    stream = new STDRangesStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(TBB)
-    // Use the C++20 implementation
-    stream = new TBBStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(THRUST)
-    // Use the Thrust implementation
-    stream = new ThrustStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(ACC)
-    // Use the OpenACC implementation
-    stream = new ACCStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(SYCL) || defined(SYCL2020)
-    // Use the SYCL implementation
-    stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(OMP)
-    // Use the OpenMP implementation
-    stream = new OMPStream<T>(ARRAY_SIZE, deviceIndex);
-
-#elif defined(ALPAKA)
-    // Use the alpaka implementation
-    stream = new AlpakaStream<T>(ARRAY_SIZE, deviceIndex);
-
-#endif
-
-    stream->init_arrays(startA, startB, startC);
-
-    // Result of the Dot kernel, if used.
-    T sum = 0.0;
-
-    std::vector<std::vector<double>> timings;
-
-    switch(selection)
-    {
-    case Benchmark::All:
-        timings = run_all<T>(stream, sum);
-        break;
-    case Benchmark::Triad:
-        timings = run_triad<T>(stream);
-        break;
-    case Benchmark::Nstream:
-        timings = run_nstream<T>(stream);
-        break;
-    };
-
-    // Check solutions
-    // Create host vectors
-    std::vector<T> a(ARRAY_SIZE);
-    std::vector<T> b(ARRAY_SIZE);
-    std::vector<T> c(ARRAY_SIZE);
-
-
-    stream->read_arrays(a, b, c);
-    check_solution<T>(num_times, a, b, c, sum);
-
-    // Display timing results
-    if(output_as_csv)
-    {
-        std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                  << "sizeof" << csv_separator << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec")
-                  << csv_separator << "min_runtime" << csv_separator << "max_runtime" << csv_separator << "avg_runtime"
-                  << std::endl;
-    }
-    else
-    {
-        std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12)
-                  << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") << std::left << std::setw(12) << "Min (sec)"
-                  << std::left << std::setw(12) << "Max" << std::left << std::setw(12) << "Average" << std::endl
-                  << std::fixed;
-    }
-
-
-    if(selection == Benchmark::All || selection == Benchmark::Nstream)
-    {
-        std::vector<std::string> labels;
-        std::vector<size_t> sizes;
-
-        if(selection == Benchmark::All)
-        {
-            labels = {"Copy", "Mul", "Add", "Triad", "Dot"};
-            sizes
-                = {2 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   3 * sizeof(T) * ARRAY_SIZE,
-                   2 * sizeof(T) * ARRAY_SIZE};
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            labels = {"Nstream"};
-            sizes = {4 * sizeof(T) * ARRAY_SIZE};
-        }
-
-        for(int i = 0; i < timings.size(); ++i)
-        {
-            // Get min/max; ignore the first result
-            auto minmax = std::minmax_element(timings[i].begin() + 1, timings[i].end());
-
-            // Calculate average; ignore the first result
-            double average = std::accumulate(timings[i].begin() + 1, timings[i].end(), 0.0) / (double) (num_times - 1);
-
-            // Display results
-            if(output_as_csv)
-            {
-                std::cout << labels[i] << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                          << sizeof(T) << csv_separator
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator
-                          << *minmax.first << csv_separator << *minmax.second << csv_separator << average << std::endl;
-            }
-            else
-            {
-                std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12)
-                          << std::setprecision(3)
-                          << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << std::left
-                          << std::setw(12) << std::setprecision(5) << *minmax.first << std::left << std::setw(12)
-                          << std::setprecision(5) << *minmax.second << std::left << std::setw(12)
-                          << std::setprecision(5) << average << std::endl;
-            }
-        }
-    }
-    else if(selection == Benchmark::Triad)
-    {
-        // Display timing results
-        double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times;
-        double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]);
-
-        if(output_as_csv)
-        {
-            std::cout << "function" << csv_separator << "num_times" << csv_separator << "n_elements" << csv_separator
-                      << "sizeof" << csv_separator << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec")
-                      << csv_separator << "runtime" << std::endl;
-            std::cout << "Triad" << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator
-                      << sizeof(T) << csv_separator << bandwidth << csv_separator << timings[0][0] << std::endl;
-        }
-        else
-        {
-            std::cout << "--------------------------------" << std::endl
-                      << std::fixed << "Runtime (seconds): " << std::left << std::setprecision(5) << timings[0][0]
-                      << std::endl
-                      << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "):  " << std::left
-                      << std::setprecision(3) << bandwidth << std::endl;
-        }
-    }
-
-    delete stream;
-}
-
-template<typename T>
-void check_solution(unsigned int const ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
-{
-    // Generate correct solution
-    T goldA = startA;
-    T goldB = startB;
-    T goldC = startC;
-    T goldSum = 0.0;
-
-    const T scalar = startScalar;
-
-    for(unsigned int i = 0; i < ntimes; i++)
-    {
-        // Do STREAM!
-        if(selection == Benchmark::All)
-        {
-            goldC = goldA;
-            goldB = scalar * goldC;
-            goldC = goldA + goldB;
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Triad)
-        {
-            goldA = goldB + scalar * goldC;
-        }
-        else if(selection == Benchmark::Nstream)
-        {
-            goldA += goldB + scalar * goldC;
-        }
-    }
-
-    // Do the reduction
-    goldSum = goldA * goldB * ARRAY_SIZE;
-
-    // Calculate the average error
-    double errA
-        = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldA); });
-    errA /= a.size();
-    double errB
-        = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldB); });
-    errB /= b.size();
-    double errC
-        = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val) { return sum + fabs(val - goldC); });
-    errC /= c.size();
-    double errSum = fabs((sum - goldSum) / goldSum);
-
-    double epsi = std::numeric_limits<T>::epsilon() * 100.0;
-
-    if(errA > epsi)
-        std::cerr << "Validation failed on a[]. Average error " << errA << std::endl;
-    if(errB > epsi)
-        std::cerr << "Validation failed on b[]. Average error " << errB << std::endl;
-    if(errC > epsi)
-        std::cerr << "Validation failed on c[]. Average error " << errC << std::endl;
-    // Check sum to 8 decimal places
-    if(selection == Benchmark::All && errSum > 1.0E-8)
-        std::cerr << "Validation failed on sum. Error " << errSum << std::endl
-                  << std::setprecision(15) << "Sum was " << sum << " but should be " << goldSum << std::endl;
-}
-
-int parseUInt(char const* str, unsigned int* output)
-{
-    char* next;
-    *output = strtoul(str, &next, 10);
-    return !strlen(next);
-}
-
-int parseInt(char const* str, int* output)
-{
-    char* next;
-    *output = strtol(str, &next, 10);
-    return !strlen(next);
-}
-
-void parseArguments(int argc, char* argv[])
-{
-    for(int i = 1; i < argc; i++)
-    {
-        if(!std::string("--list").compare(argv[i]))
-        {
-            listDevices();
-            exit(EXIT_SUCCESS);
-        }
-        else if(!std::string("--device").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &deviceIndex))
-            {
-                std::cerr << "Invalid device index." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i]))
-        {
-            if(++i >= argc || !parseInt(argv[i], &ARRAY_SIZE) || ARRAY_SIZE <= 0)
-            {
-                std::cerr << "Invalid array size." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i]))
-        {
-            if(++i >= argc || !parseUInt(argv[i], &num_times))
-            {
-                std::cerr << "Invalid number of times." << std::endl;
-                exit(EXIT_FAILURE);
-            }
-            if(num_times < 2)
-            {
-                std::cerr << "Number of times must be 2 or more" << std::endl;
-                exit(EXIT_FAILURE);
-            }
-        }
-        else if(!std::string("--float").compare(argv[i]))
-        {
-            use_float = true;
-        }
-        else if(!std::string("--triad-only").compare(argv[i]))
-        {
-            selection = Benchmark::Triad;
-        }
-        else if(!std::string("--nstream-only").compare(argv[i]))
-        {
-            selection = Benchmark::Nstream;
-        }
-        else if(!std::string("--csv").compare(argv[i]))
-        {
-            output_as_csv = true;
-        }
-        else if(!std::string("--mibibytes").compare(argv[i]))
-        {
-            mibibytes = true;
-        }
-        else if(!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i]))
-        {
-            std::cout << std::endl;
-            std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
-            std::cout << "Options:" << std::endl;
-            std::cout << "  -h  --help               Print the message" << std::endl;
-            std::cout << "      --list               List available devices" << std::endl;
-            std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
-            std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
-            std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
-            std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
-            std::cout << "      --triad-only         Only run triad" << std::endl;
-            std::cout << "      --nstream-only       Only run nstream" << std::endl;
-            std::cout << "      --csv                Output as csv table" << std::endl;
-            std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)"
-                      << std::endl;
-            std::cout << std::endl;
-            exit(EXIT_SUCCESS);
-        }
-        else
-        {
-            std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" << std::endl;
-            exit(EXIT_FAILURE);
-        }
-    }
-}
-
-// NOLINTEND
diff --git a/alpaka/example/bufferCopy/src/bufferCopy.cpp b/alpaka/example/bufferCopy/src/bufferCopy.cpp
index 998df953..7c2f9e16 100644
--- a/alpaka/example/bufferCopy/src/bufferCopy.cpp
+++ b/alpaka/example/bufferCopy/src/bufferCopy.cpp
@@ -1,10 +1,10 @@
-/* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner, Bernhard Manfred Gruber,
- *                Jan Stephan
+/* Copyright 2024 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner, Bernhard Manfred Gruber,
+ *                Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: ISC
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -15,13 +15,12 @@ struct PrintBufferKernel
     template<typename TAcc, typename MdSpan>
     ALPAKA_FN_ACC auto operator()(TAcc const& acc, MdSpan data) const -> void
     {
-        auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(size_t z = idx[0]; z < data.extent(0); z += gridSize[0])
-            for(size_t y = idx[1]; y < data.extent(1); y += gridSize[1])
-                for(size_t x = idx[2]; x < data.extent(2); x += gridSize[2])
-                    printf("%zu,%zu,%zu:%u ", z, y, x, static_cast<uint32_t>(data(z, y, x)));
+        // Use three nested loops along the dimensions 0, 1 and 2
+        for(size_t z : alpaka::uniformElementsAlong<0>(acc, data.extent(0)))
+            for(size_t y : alpaka::uniformElementsAlong<1>(acc, data.extent(1)))
+                for(size_t x : alpaka::uniformElementsAlong<2>(acc, data.extent(2)))
+                    // %zu prints garbage in some cases, while %lu seems to be working correctly
+                    printf("%lu,%lu,%lu: %u\t", z, y, x, static_cast<uint32_t>(data(z, y, x)));
     }
 };
 
@@ -31,17 +30,15 @@ struct TestBufferKernel
     template<typename TAcc, typename MdSpan>
     ALPAKA_FN_ACC auto operator()(TAcc const& acc, MdSpan data) const -> void
     {
-        using Vec = alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>;
-
-        auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(size_t z = idx[0]; z < data.extent(0); z += gridSize[0])
-            for(size_t y = idx[1]; y < data.extent(1); y += gridSize[1])
-                for(size_t x = idx[2]; x < data.extent(2); x += gridSize[2])
+        // Use three nested loops along the dimensions z, y and x
+        for(size_t z : alpaka::uniformElementsAlongZ(acc, data.extent(0)))
+            for(size_t y : alpaka::uniformElementsAlongY(acc, data.extent(1)))
+                for(size_t x : alpaka::uniformElementsAlongX(acc, data.extent(2)))
                     ALPAKA_ASSERT_ACC(
                         data(z, y, x)
-                        == alpaka::mapIdx<1u>(Vec{z, y, x}, Vec{data.extent(0), data.extent(1), data.extent(2)})[0]);
+                        == alpaka::mapIdx<1u>(
+                            alpaka::Vec{z, y, x},
+                            alpaka::Vec{data.extent(0), data.extent(1), data.extent(2)})[0]);
     }
 };
 
@@ -51,57 +48,36 @@ struct FillBufferKernel
     template<typename TAcc, typename MdSpan>
     ALPAKA_FN_ACC auto operator()(TAcc const& acc, MdSpan data) const -> void
     {
-        using Vec = alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>;
-
-        auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const gridSize = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        for(size_t z = idx[0]; z < data.extent(0); z += gridSize[0])
-            for(size_t y = idx[1]; y < data.extent(1); y += gridSize[1])
-                for(size_t x = idx[2]; x < data.extent(2); x += gridSize[2])
-                    data(z, y, x)
-                        = alpaka::mapIdx<1u>(Vec{z, y, x}, Vec{data.extent(0), data.extent(1), data.extent(2)})[0];
+        // Use a single 3-dimensional loop
+        for(auto idx : alpaka::uniformElementsND(acc, alpaka::Vec{data.extent(0), data.extent(1), data.extent(2)}))
+            data(idx.z(), idx.y(), idx.x()) // equivalent to data(idx[0], idx[1], idx[2])
+                = alpaka::mapIdx<1u>(idx, alpaka::Vec{data.extent(0), data.extent(1), data.extent(2)})[0];
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
     // Define the index domain
     using Dim = alpaka::DimInt<3u>;
     using Idx = std::size_t;
 
     // Define the device accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
-    // Defines the synchronization behavior of a queue
+    // Defines the synchronization behavior of the device queue
     //
     // choose between Blocking and NonBlocking
     using AccQueueProperty = alpaka::Blocking;
     using DevQueue = alpaka::Queue<Acc, AccQueueProperty>;
 
-    // Define the device accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuSerial
+    // Define the host accelerator
     using Host = alpaka::AccCpuSerial<Dim, Idx>;
-    // Defines the synchronization behavior of a queue
+    // Defines the synchronization behavior of the host queue
     //
     // choose between Blocking and NonBlocking
     using HostQueueProperty = alpaka::Blocking;
@@ -119,21 +95,8 @@ auto main() -> int
 
     // Define the work division for kernels to be run on devAcc and devHost
     using Vec = alpaka::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerGrid(Vec::all(static_cast<Idx>(10)));
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-    WorkDiv const devWorkDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
-    WorkDiv const hostWorkDiv = alpaka::getValidWorkDiv<Host>(
-        devHost,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    Vec const elementsPerThread(Vec::all(static_cast<Idx>(3)));
+    Vec const elementsPerGrid(Vec::all(static_cast<Idx>(10)));
 
     // Create host and device buffers
     //
@@ -146,14 +109,14 @@ auto main() -> int
     using Data = std::uint32_t;
     constexpr Idx nElementsPerDim = 2;
 
-    const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
+    Vec const extents = Vec::all(nElementsPerDim);
 
     // Allocate host memory buffers
     //
     // The `alloc` method returns a reference counted buffer handle.
     // When the last such handle is destroyed, the memory is freed automatically.
     using BufHost = alpaka::Buf<Host, Data, Dim, Idx>;
-    BufHost hostBuffer(alpaka::allocBuf<Data, Idx>(devHost, extents));
+    BufHost hostBuffer = alpaka::allocBuf<Data, Idx>(devHost, extents);
     // You can also use already allocated memory and wrap it within a view (irrespective of the device type).
     // The view does not own the underlying memory. So you have to make sure that
     // the view does not outlive its underlying memory.
@@ -164,8 +127,8 @@ auto main() -> int
     //
     // The interface to allocate a buffer is the same on the host and on the device.
     using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
-    BufAcc deviceBuffer1(alpaka::allocBuf<Data, Idx>(devAcc, extents));
-    BufAcc deviceBuffer2(alpaka::allocBuf<Data, Idx>(devAcc, extents));
+    BufAcc deviceBuffer1 = alpaka::allocBuf<Data, Idx>(devAcc, extents);
+    BufAcc deviceBuffer2 = alpaka::allocBuf<Data, Idx>(devAcc, extents);
 
 
     // Init host buffer
@@ -180,9 +143,9 @@ auto main() -> int
     // some values into the buffer memory.
     // Mind, that only a host can write on host memory.
     // The same holds true for device memory.
-    for(Idx z(0); z < extents[0]; ++z)
-        for(Idx y(0); y < extents[1]; ++y)
-            for(Idx x(0); x < extents[2]; ++x)
+    for(Idx z = 0; z < extents[0]; ++z)
+        for(Idx y = 0; y < extents[1]; ++y)
+            for(Idx x = 0; x < extents[2]; ++x)
                 hostBufferMdSpan(z, y, x) = static_cast<Data>(z * extents[1] * extents[2] + y * extents[2] + x);
 
     // Memory views and buffers can also be initialized by executing a kernel.
@@ -192,10 +155,12 @@ auto main() -> int
 
     FillBufferKernel fillBufferKernel;
 
+    alpaka::KernelCfg<Host> const hostKernelCfg = {elementsPerGrid, elementsPerThread};
+    auto const hostWorkDiv = alpaka::getValidWorkDiv(hostKernelCfg, devHost, fillBufferKernel, hostViewPlainPtrMdSpan);
+
     alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
                        hostViewPlainPtrMdSpan); // 1st kernel argument
 
-
     // Copy host to device Buffer
     //
     // A copy operation of one buffer into
@@ -228,10 +193,14 @@ auto main() -> int
     auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);
 
     TestBufferKernel testBufferKernel;
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const devKernelCfg = {elementsPerGrid, elementsPerThread};
+    auto const devWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, testBufferKernel, deviceBufferMdSpan1);
+
     alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
     alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
 
-
     // Print device Buffer
     //
     // Because we really like to flood our
@@ -243,22 +212,43 @@ auto main() -> int
     // completely distorted.
 
     PrintBufferKernel printBufferKernel;
-    alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan1);
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const hostPrintWorkDiv
+        = alpaka::getValidWorkDiv(hostKernelCfg, devHost, printBufferKernel, hostViewPlainPtrMdSpan);
+    auto const devPrintWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, printBufferKernel, deviceBufferMdSpan1);
+
+    alpaka::exec<Acc>(devQueue, devPrintWorkDiv, printBufferKernel, deviceBufferMdSpan1);
     alpaka::wait(devQueue);
     std::cout << std::endl;
 
-    alpaka::exec<Acc>(devQueue, devWorkDiv, printBufferKernel, deviceBufferMdSpan2);
+    alpaka::exec<Acc>(devQueue, devPrintWorkDiv, printBufferKernel, deviceBufferMdSpan2);
     alpaka::wait(devQueue);
     std::cout << std::endl;
 
-    alpaka::exec<Host>(hostQueue, hostWorkDiv, printBufferKernel, hostBufferMdSpan);
+    alpaka::exec<Host>(hostQueue, hostPrintWorkDiv, printBufferKernel, hostBufferMdSpan);
     alpaka::wait(hostQueue);
     std::cout << std::endl;
 
-    alpaka::exec<Host>(hostQueue, hostWorkDiv, printBufferKernel, hostViewPlainPtrMdSpan);
+    alpaka::exec<Host>(hostQueue, hostPrintWorkDiv, printBufferKernel, hostViewPlainPtrMdSpan);
     alpaka::wait(hostQueue);
     std::cout << std::endl;
 
     return EXIT_SUCCESS;
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/complex/CMakeLists.txt b/alpaka/example/complex/CMakeLists.txt
index 1bd7783d..f04dddc2 100644
--- a/alpaka/example/complex/CMakeLists.txt
+++ b/alpaka/example/complex/CMakeLists.txt
@@ -15,7 +15,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 set(_TARGET_NAME complexNumbers)
 
-project(${_TARGET_NAME})
+project(${_TARGET_NAME} LANGUAGES CXX)
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
diff --git a/alpaka/example/complex/src/complex.cpp b/alpaka/example/complex/src/complex.cpp
index beadbc77..d9da52a0 100644
--- a/alpaka/example/complex/src/complex.cpp
+++ b/alpaka/example/complex/src/complex.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -28,33 +28,17 @@ struct ComplexKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
-    // Each accelerator has strengths and weaknesses. Therefore,
-    // they need to be choosen carefully depending on the actual
-    // use case. Furthermore, some accelerators only support a
-    // particular workdiv, but workdiv can also be generated
-    // automatically.
-
-    // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -69,17 +53,18 @@ auto main() -> int
     Queue queue(devAcc);
 
     // Define the work division
-    Idx const threadsPerGrid = 1u;
+    Idx const elementsPerGrid = 1u;
     Idx const elementsPerThread = 1u;
-    auto const workDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+
+    ComplexKernel complexKernel;
+
+    alpaka::KernelCfg<Acc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, complexKernel);
 
     // Run the kernel
-    alpaka::exec<Acc>(queue, workDiv, ComplexKernel{});
+    alpaka::exec<Acc>(queue, workDiv, complexKernel);
     alpaka::wait(queue);
 
     // Usage of alpaka::Complex<T> on the host side is the same as inside kernels, except math functions are not
@@ -91,5 +76,20 @@ auto main() -> int
     alpaka::Complex<float> z = x + y;
 
     return EXIT_SUCCESS;
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/conv2DWithMdspan/CMakeLists.txt b/alpaka/example/conv2DWithMdspan/CMakeLists.txt
new file mode 100644
index 00000000..50e5c765
--- /dev/null
+++ b/alpaka/example/conv2DWithMdspan/CMakeLists.txt
@@ -0,0 +1,53 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME conv2DWithMdspan)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+
+if (alpaka_USE_MDSPAN STREQUAL "OFF")
+    message(STATUS "The conv2DWithMdspan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
+    return()
+endif ()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/conv2DWithMdspan.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/alpaka/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp b/alpaka/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
new file mode 100644
index 00000000..698a8d12
--- /dev/null
+++ b/alpaka/example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
@@ -0,0 +1,245 @@
+/* Copyright 2023 Mehmet Yusufoglu, Bernhard Manfred Gruber, René Widera
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+// Needed for running example for all backends available; one by one
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+//! Convolution Example using Mdspan structure to pass multi-dimensional data to the kernel
+//!
+//! A 2D Convolutional filter applied to a matrix. Mdspan data structure is used, therefore pitch and size values are
+//! not needed to be passed to the kernel. Results can be tested by comparing with the results of the Matlab call: Y =
+//! filter2(FilterMatrix,InputMatrix,'same');
+
+/**
+ * @brief 2D Convolutional Filter using only global memory for the input-matrix and the filter-matrix
+ */
+struct ConvolutionKernelMdspan2D
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TElem The input-matrix and filter-matrix element type
+    //! \param acc Accelerator
+    //! \param input Input matrix
+    //! \param output Output matrix
+    //! \param filter Filter-matrix
+
+    template<typename TAcc, typename MdSpan>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, MdSpan const input, MdSpan output, MdSpan const filter) const
+        -> void
+    {
+        static_assert(
+            alpaka::Dim<TAcc>::value == 2u,
+            "The accelerator used for the Alpaka Kernel has to be 2 dimensional!");
+
+        auto matrixWidth = input.extent(0);
+        auto matrixHeight = input.extent(1);
+        // Filter matrix is square
+        int32_t filterWidth = filter.extent(0);
+
+        // Get thread index, the center of filter-matrix is positioned to the item on this index.
+        int32_t const row = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        int32_t const col = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
+
+        // The convolutional filter-matrix applied to the input matrix, it's position is row and col. If some of the
+        // items of the filter are outside the matrix, those are not taken into calculation or they are assumed zero.
+        if(col < matrixWidth && row < matrixHeight)
+        {
+            float pValue{0.0f};
+            for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+                {
+                    // Position of input matrix element to be multiplied with the corresponding element at filter
+                    auto const exactRow = row - filterWidth / 2 + fRow;
+                    auto const exactCol = col - filterWidth / 2 + fCol;
+                    if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
+                    {
+                        pValue += filter(fRow, fCol) * input(exactRow, exactCol);
+                    }
+                }
+            }
+            output(row, col) = pValue;
+        }
+    }
+};
+
+auto FuzzyEqual(float a, float b) -> bool
+{
+    return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 1000.0f;
+}
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Define the index domain
+    using Dim = alpaka::DimInt<2>;
+    // Index type
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
+
+    using DataType = float;
+    static constexpr Idx filterWidth = 5;
+    static constexpr Idx matrixWidth = 128;
+    static constexpr Idx matrixHeight = 128;
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define the 2D extent (dimensions)
+    Vec const extent(static_cast<Idx>(matrixWidth), static_cast<Idx>(matrixHeight));
+
+    //
+    // Input vector allocation and copy to device buffer
+    //
+    std::vector<DataType> bufInputHost1D(extent.prod(), 1);
+    // Use increasing values as input
+    std::iota(bufInputHost1D.begin(), bufInputHost1D.end(), 1.0f);
+    for(DataType& element : bufInputHost1D)
+    {
+        element /= matrixWidth;
+    }
+    // Create 2D view
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost1D.data(), extent);
+
+    // Input buffer at device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    // Copy input view from host to device by copying to alpaka buffer type
+    alpaka::memcpy(queueAcc, bufInputAcc, bufInputHostView);
+    alpaka::wait(queueAcc);
+    //
+    //  Output buffer allocation at device
+    //
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+
+    //  Prepare convolution filter at host
+    //
+    std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
+                                          0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
+
+    Vec const filterExtent(static_cast<Idx>(filterWidth), static_cast<Idx>(filterWidth));
+    // Create 2D view from std::vector in order to use in alpaka::memcpy
+    auto bufFilterHostView = alpaka::createView(devHost, filter.data(), filterExtent);
+
+    // The buffer for the filter data at device
+    auto bufFilterAcc = alpaka::allocBuf<DataType, Idx>(devAcc, filterExtent);
+    // Copy input view from host to device by copying to alpaka buffer type
+    alpaka::memcpy(queueAcc, bufFilterAcc, bufFilterHostView);
+    alpaka::wait(queueAcc);
+
+    //  Construct kernel object
+    ConvolutionKernelMdspan2D convolutionKernel2D;
+
+    //   Let alpaka calculate good block and grid sizes given our full problem extent.
+    alpaka::KernelCfg<DevAcc> const kernelCfg = {extent, Vec::ones()};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        convolutionKernel2D,
+        alpaka::experimental::getMdSpan(bufInputAcc),
+        alpaka::experimental::getMdSpan(outputDeviceMemory),
+        alpaka::experimental::getMdSpan(bufFilterAcc));
+
+
+    // Run the kernel, pass 3 arrays as 2D mdspans
+    alpaka::exec<DevAcc>(
+        queueAcc,
+        workDiv,
+        convolutionKernel2D,
+        alpaka::experimental::getMdSpan(bufInputAcc),
+        alpaka::experimental::getMdSpan(outputDeviceMemory),
+        alpaka::experimental::getMdSpan(bufFilterAcc));
+
+    // Allocate memory on host to receive the resulting matrix as an array
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent);
+    // Copy result from device memory to host
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent);
+    alpaka::wait(queueAcc);
+
+    //  Print results
+    std::cout << "Convolution filter kernel ConvolutionKernelMdspan2D.\n";
+    std::cout << "Matrix Size:" << matrixWidth << "x" << matrixHeight << ", Filter Size:" << filterWidth << "x"
+              << filterWidth << "\n";
+
+    // Print 2D output as 1D
+    //  for(size_t i{0}; i < matrixWidth * matrixHeight; ++i)
+    //  {
+    //     std::cout << "output[" << i << "]:" << std::setprecision(6) << *(std::data(resultGpuHost) + i) << std::endl;
+    //  }
+
+    // Print output using MdSpan
+    for(size_t i{0}; i < matrixHeight; ++i)
+    {
+        for(size_t j{0}; j < matrixWidth; ++j)
+        {
+            std::cout << "outputMdSpan[" << i << "," << j << "]:" << std::setprecision(6)
+                      << alpaka::experimental::getMdSpan(resultGpuHost)(i, j) << std::endl;
+        }
+    }
+
+    // Expected array of sampled results
+    std::vector<DataType> const expectedOutput{
+        4.622344e+00,
+        1.106426e+02,
+        2.162168e+02,
+        3.217910e+02,
+        4.273652e+02,
+        4.199258e+02,
+        6.385137e+02,
+        7.440879e+02,
+        8.496621e+02,
+        9.552363e+02,
+        4.390715e+02};
+    // Select samples from output to check results
+    size_t const numberOfSamples{10};
+    size_t const samplePeriod{matrixWidth * matrixHeight / numberOfSamples};
+    bool allEqual{true};
+    for(size_t i{0}; i < numberOfSamples; ++i)
+    {
+        // Compare with the reference results, select one from every samplePeriod element
+        bool fuzzyEqual = FuzzyEqual(*(std::data(resultGpuHost) + i * samplePeriod), expectedOutput[i]);
+        if(!fuzzyEqual)
+            std::cout << *(std::data(resultGpuHost) + i * samplePeriod) << " " << expectedOutput[i] << std::endl;
+        allEqual = allEqual && fuzzyEqual;
+    }
+    if(!allEqual)
+    {
+        std::cout << "Error: Some 2D convolution results doesn't match!\n";
+        return EXIT_FAILURE;
+    }
+    std::cout << "Sampled result checks are correct!\n";
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/convolution1D/CMakeLists.txt b/alpaka/example/convolution1D/CMakeLists.txt
new file mode 100644
index 00000000..a43a1721
--- /dev/null
+++ b/alpaka/example/convolution1D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME convolution1D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution1D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/alpaka/example/convolution1D/src/convolution1D.cpp b/alpaka/example/convolution1D/src/convolution1D.cpp
new file mode 100644
index 00000000..1e3aec67
--- /dev/null
+++ b/alpaka/example/convolution1D/src/convolution1D.cpp
@@ -0,0 +1,205 @@
+/* Copyright 2023  Bernhard Manfred Gruber, Simeon Ehrig, Rene Widera, Mehmet Yusufoglu.
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
+//! Convolution Example
+//!
+//! 1D convolution example: Creates two 1D arrays, applies convolution filter.
+//! Array sizes are hardcoded.
+//!
+
+/**
+ * @brief The ConvolutionKernel function-object
+ * Calculates 1D convolution using input and filter arrays.
+ */
+struct ConvolutionKernel
+{
+    /** @brief Main convolution code
+     *  @param Accelerator
+     *  @param Input array, first input of convolution integral
+     *  @param Filter array, second input of convolution integral
+     *  @param Empty output array to be filled
+     *  @param Input array size
+     *  @param Filter size
+     */
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem const* const filter,
+        TElem* const output,
+        std::size_t const inputSize,
+        std::size_t const filterSize) const -> void
+    {
+        auto const globalThreadIdxX = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+        // Since the kernel is launched 1-D calculating linearizedGlobalThreadIdx line is unnecessary.
+        // globalThreadIdx[0] can be used to map all the threads.
+        if(globalThreadIdxX < inputSize)
+        {
+            int32_t const halfFilterSize = filterSize / 2;
+            TElem result = 0.0f;
+            // Calculate sum of multiplications of corresponding elements
+            auto const start
+                = static_cast<int32_t>(std::max(static_cast<int32_t>(globalThreadIdxX) - halfFilterSize, 0));
+            auto const stop = std::min(globalThreadIdxX + halfFilterSize, inputSize - 1);
+            for(int32_t i = start; i <= stop; ++i)
+                result += input[i] * filter[i + halfFilterSize - static_cast<int32_t>(globalThreadIdxX)];
+            output[globalThreadIdxX] = result;
+        }
+    }
+};
+
+auto FuzzyEqual(float a, float b) -> bool
+{
+    return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 10.0f;
+}
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Size of 1D arrays to be used in convolution integral
+    // Here instead of "convolution kernel" the term "filter" is used because kernel has a different meaning in GPU
+    // programming. Secondly filter array is not reversed. Implemented like a convolutional layer in CNN.
+    constexpr size_t filterSize = 3;
+    using DataType = float;
+    constexpr size_t inputSize = 8;
+    constexpr std::array<DataType, inputSize> expectedOutput = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f};
+
+    // Define the index domain
+    using Dim = alpaka::DimInt<1u>;
+    // Index type
+    using Idx = std::size_t;
+
+    // Define the accelerator
+    using DevAcc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    using QueueProperty = alpaka::Blocking;
+    using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
+    using BufAcc = alpaka::Buf<DevAcc, DataType, Dim, Idx>;
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << '\n';
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+
+    // Select a device
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queue(devAcc);
+
+    // Allocate memory host input
+    auto hostInputMemory = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
+
+    // Fill array with data
+    for(size_t i = 0; i < inputSize; i++)
+        hostInputMemory[i] = static_cast<DataType>(i + 1);
+
+    // Allocate memory host filter
+    auto hostFilterMemory = alpaka::allocBuf<DataType, Idx>(devHost, filterSize);
+
+    // Fill array with any data
+    for(size_t i = 0; i < filterSize; i++)
+        hostFilterMemory[i] = static_cast<DataType>(i + 1) / 10.0f;
+
+    // Allocate memory in device
+    BufAcc inputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, inputSize);
+    BufAcc filterDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, filterSize);
+    BufAcc outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, static_cast<Idx>(inputSize));
+
+    // Copy input and filter (convolution kernel array) from host to device
+    alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, inputSize);
+    alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, filterSize);
+    // Make sure memcpy finished.
+    alpaka::wait(queue);
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
+    // Grid size
+    auto const elementsPerGrid = inputSize;
+
+    // Instantiate the kernel (gpu code) function-object
+    ConvolutionKernel convolutionKernel;
+
+    // Native pointers needed for the kernel execution function
+    DataType* nativeFilterDeviceMemory = std::data(filterDeviceMemory);
+    DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
+    DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);
+
+    alpaka::KernelCfg<DevAcc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        convolutionKernel,
+        nativeInputDeviceMemory,
+        nativeFilterDeviceMemory,
+        nativeOutputDeviceMemory,
+        inputSize,
+        filterSize);
+
+    // Run the kernel
+    alpaka::exec<DevAcc>(
+        queue,
+        workDiv,
+        convolutionKernel,
+        nativeInputDeviceMemory,
+        nativeFilterDeviceMemory,
+        nativeOutputDeviceMemory,
+        inputSize,
+        filterSize);
+
+    // Allocate memory on host
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
+    // Copy from device memory to host
+    alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, inputSize);
+    alpaka::wait(queue);
+
+    bool allEqual{true};
+    // Print result array at the host
+    for(size_t i{0}; i < inputSize; i++)
+    {
+        std::cout << "output[" << i << "]:" << std::setprecision(3) << resultGpuHost[i] << "\n";
+        // Compare with the reference output
+        bool fuzzyEqual = FuzzyEqual(resultGpuHost[i], expectedOutput[i]);
+        allEqual = allEqual && fuzzyEqual;
+    }
+    if(!allEqual)
+    {
+        std::cout << "Error: Some convolution results doesn't match!\n";
+        return EXIT_FAILURE;
+    }
+    std::cout << "All results are correct!\n";
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/convolution2D/CMakeLists.txt b/alpaka/example/convolution2D/CMakeLists.txt
new file mode 100644
index 00000000..2324f7b8
--- /dev/null
+++ b/alpaka/example/convolution2D/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME convolution2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/convolution2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/alpaka/example/convolution2D/src/convolution2D.cpp b/alpaka/example/convolution2D/src/convolution2D.cpp
new file mode 100644
index 00000000..87f618c7
--- /dev/null
+++ b/alpaka/example/convolution2D/src/convolution2D.cpp
@@ -0,0 +1,409 @@
+/* Copyright 2023 Mehmet Yusufoglu, Bernhard Manfred Gruber, René Widera
+ * SPDX-License-Identifier: ISC
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+//! Convolution Example
+//!
+//! A 2D Convolutional filter applied to a matrix. The first kernel, ConvolutionKernel2DGlobalMemory, uses only global
+//! memory. The second kernel ConvolutionKernel2DSharedMemory uses tiling and shared memory. Block size is assumed to
+//! be equal to the tile size. First, the tile is copied to shared memory, since an element in a tile is accessed many
+//! times; using the shared memory for the main matrix data increases performance. Each block works on the domain of
+//! one tile. But at the border of the tile, some external matrix values are needed (at the border with another tile)
+//! then those matrix values are taken from the global memory.
+//! Results can be tested by comparing with the results of the Matlab call: Y =
+//! filter2(FilterMatrix,InputMatrix,'same');
+
+/**
+ * @brief 2D Convolutional Filter using only global memory for the input-matrix and the filter-matrix
+ */
+struct ConvolutionKernel2DGlobalMemory
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TElem The input-matrix and filter-matrix element type
+    //! \param acc Accelerator
+    //! \param input Input matrix
+    //! \param output Output matrix
+    //! \param matrixWidth Input matrix width
+    //! \param matrixHeight Input matrix height
+    //! \param filter Filter-matrix
+    //! \param filterWidth Filter-matrix width
+    //! \param intputWidthAllocated Input-matrix width allocated (possibly larger than normal width due to paddding)
+    //! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than normal width due to paddding
+
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        int32_t const matrixWidth,
+        int32_t const matrixHeight,
+        TElem const* const filter,
+        int32_t const filterWidth,
+        int32_t const intputWidthAllocated,
+        int32_t const filterWidthAllocated) const -> void
+    {
+        // Get thread index, the center of filter-matrix is positioned to the item on this index.
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        // Block index with respect to thread
+        auto const [blockThreadY, blockThreadX] = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+
+        // The convolutional filter-matrix applied to the input matrix, it's position is row and col. If some of the
+        // items of the filter are outside the matrix, those are not taken into calculation or they are assumed zero.
+        if(col < matrixWidth && row < matrixHeight)
+        {
+            TElem pValue{0.0f};
+            for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+                {
+                    // Position of input matrix element to be multiplied with the corresponding element at filter
+                    auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
+                    auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
+                    if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
+                    {
+                        pValue += filter[fRow * filterWidthAllocated + fCol]
+                                  * input[exactRow * intputWidthAllocated + exactCol];
+                    }
+                }
+                output[row * matrixWidth + col] = pValue;
+            }
+        }
+    }
+};
+
+/**
+ * @brief ConvolutionKernel2DSharedMemory struct. The kernel for 2D Convolutional Filter, uses
+ tiling method. Tiles of matrix are kept in the shared memory. Block
+ dimensions are equal to tile dimensions.
+ */
+struct ConvolutionKernel2DSharedMemory
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam TElem The input-matrix and filter-matrix element type
+    //! \param acc Accelerator
+    //! \param input Input matrix
+    //! \param output Output matrix
+    //! \param matrixWidth Input matrix width
+    //! \param matrixHeight Input matrix height
+    //! \param filter Filter-matrix
+    //! \param filterWidth Filter-matrix width
+    //! \param intputWidthAllocated Input-matrix width allocated (possibly larger than normal width due to paddding
+    //! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than normal width due to paddding
+
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const input,
+        TElem* output,
+        int32_t const matrixWidth,
+        int32_t const matrixHeight,
+        TElem const* const filter,
+        int32_t const filterWidth,
+        int32_t const intputWidthAllocated,
+        int32_t const filterWidthAllocated) const -> void
+    {
+        auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        // Get extents(dimensions)
+        auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        // Get indexes
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
+        // Get elements from 2-element arrays
+        auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
+        auto const [blockThreadY, blockThreadX] = blockThreadIdx;
+        auto const [gridBlockExtentY, gridBlockExtentX] = gridBlockExtent;
+        // Allocate shared memory
+        auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc);
+        // Fill shared memory of device so that tile items are accessed from shared memory
+        if(row < matrixHeight && col < matrixWidth && blockThreadIdx1D < blockThreadExtent.prod())
+        {
+            sharedN[blockThreadIdx1D] = input[row * intputWidthAllocated + col];
+        }
+        else if(blockThreadIdx1D < blockThreadExtent.prod())
+        {
+            sharedN[blockThreadIdx1D] = 0.0f;
+        }
+
+        // Wait for the block fills the shared memory with the tile of the main matrix
+        alpaka::syncBlockThreads(acc);
+
+        if(col < matrixWidth && row < matrixHeight)
+        {
+            TElem pValue{0.0f};
+            for(int32_t fRow = 0; fRow < filterWidth; fRow++)
+            {
+                for(int32_t fCol = 0; fCol < filterWidth; fCol++)
+                {
+                    // Position of input matrix element to be multiplied with the corresponding element at the filter.
+                    // The position is with respect to tile(block)
+                    auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - filterWidth / 2 + fRow;
+                    auto const exactColBlock = static_cast<int32_t>(blockThreadX) - filterWidth / 2 + fCol;
+                    if(exactColBlock >= 0 && exactColBlock < blockThreadExtentX && exactRowBlock >= 0
+                       && exactRowBlock < blockThreadExtentY)
+                    {
+                        // The element is inside the tile. Get the element from the shared memory
+                        pValue += filter[fRow * filterWidthAllocated + fCol]
+                                  * sharedN[exactRowBlock * blockThreadExtentX + exactColBlock];
+                    }
+                    else
+                    { // The element is not in the tile(block)
+                        // Position of input matrix element to be multiplied with the corresponding element at the
+                        // filter. The position is with respect to the input matrix
+                        auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
+                        auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
+                        if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
+                        {
+                            // get the item from the global memory, use padded width!
+                            pValue += filter[fRow * filterWidthAllocated + fCol]
+                                      * input[exactRow * intputWidthAllocated + exactCol];
+                        }
+                    }
+                }
+            }
+            output[row * matrixWidth + col] = pValue;
+        } // if
+    }
+};
+
+// The specialisation used for calculation of dynamic shared memory size
+namespace alpaka::trait
+{
+    //! The trait for getting the size of the block shared dynamic memory for a kernel.
+    template<typename TAcc>
+    struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc>
+    {
+        //! \tparam TVec type for extent array
+        //! \tparam TElem element type of the matrix
+        //! \return The size of the shared memory allocated for a block.
+        template<typename TVec, typename TElem>
+        ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+            ConvolutionKernel2DSharedMemory const& /* matMulKernel */,
+            TVec const& blockThreadExtent, // dimensions of thread per block
+            TVec const& threadElemExtent, // dimensions of elements per thread
+            TElem const* const, // input Matrix
+            TElem*, // output array
+            int32_t const, // matrixWidth
+            int32_t const, // matrixHeight
+            TElem const* const, // filter
+            int32_t const, // filter width
+            int32_t const, // allocated input width
+            int32_t const) // allocated filter width
+        {
+            // Reserve the buffer, buffers size is the number of elements in a block (tile)
+            return static_cast<std::size_t>(blockThreadExtent.prod() * threadElemExtent.prod()) * sizeof(TElem);
+        }
+    };
+} // namespace alpaka::trait
+
+auto FuzzyEqual(float a, float b) -> bool
+{
+    return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 1000.0f;
+}
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Define the index domain
+    using Dim = alpaka::DimInt<2>;
+    // Index type
+    using Idx = std::uint32_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    // Define the accelerator
+    using DevAcc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
+
+    using DataType = float;
+    static constexpr Idx filterWidth = 5;
+    static constexpr Idx matrixWidth = 128;
+    static constexpr Idx matrixHeight = 128;
+
+    static_assert(
+        alpaka::Dim<DevAcc>::value == 2u,
+        "The accelerator used for the Alpaka Kernel has to be 2 dimensional!");
+
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << std::endl;
+
+    auto const devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+    // Select a device from the accelerator
+    auto const platformAcc = alpaka::Platform<DevAcc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // Create a queue on the device
+    QueueAcc queueAcc(devAcc);
+    // Define the 2D extent (dimensions)
+    Vec const extent(static_cast<Idx>(matrixWidth), static_cast<Idx>(matrixHeight));
+
+    //
+    // Input vector allocation and copy to device buffer
+    //
+    std::vector<DataType> bufInputHost1D(extent.prod(), 1);
+    // Use increasing values as input
+    std::iota(bufInputHost1D.begin(), bufInputHost1D.end(), 1.0f);
+    for(DataType& element : bufInputHost1D)
+    {
+        element /= matrixWidth;
+    }
+    // Create 2D view
+    auto bufInputHostView = alpaka::createView(devHost, bufInputHost1D.data(), extent);
+
+    // Input buffer at device
+    auto bufInputAcc = alpaka::allocBuf<DataType, Idx>(devAcc, extent);
+    // Copy input view from host to device by copying to alpaka buffer type
+    alpaka::memcpy(queueAcc, bufInputAcc, bufInputHostView);
+    alpaka::wait(queueAcc);
+
+    // Calculate the allocated width, due to padding it might be larger then the matrix width
+    auto const intputWidthAllocated = [&]() -> Idx const
+    {
+        // Calculate pitch: The size of one line in bytes including padding.
+        auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
+        return static_cast<Idx>(rowPitchInput / sizeof(DataType));
+    }();
+
+    //
+    //  Output buffer allocation at device
+    //
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent1D(matrixHeight * matrixWidth);
+    auto outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, extent1D);
+
+    //   Prepare convolution filter
+    //
+    std::vector<DataType> const filter = {0.11, 0.12, 0.13, 0.14, 0.15, 0.21, 0.22, 0.23, 0.24, 0.25, 0.31, 0.32, 0.33,
+                                          0.34, 0.35, 0.41, 0.42, 0.43, 0.44, 0.45, 0.51, 0.52, 0.53, 0.54, 0.55};
+
+    Vec const filterExtent(static_cast<Idx>(filterWidth), static_cast<Idx>(filterWidth));
+    // Create 2D view
+    auto bufFilterHostView = alpaka::createView(devHost, filter.data(), filterExtent);
+
+    // Filter buffer at device
+    auto bufFilterAcc = alpaka::allocBuf<DataType, Idx>(devAcc, filterExtent);
+    // Copy input view from host to device by copying to alpaka buffer type
+    alpaka::memcpy(queueAcc, bufFilterAcc, bufFilterHostView);
+    alpaka::wait(queueAcc);
+
+    // Calculate the allocated width, due to padding it might be larger then the matrix width
+    auto const filterWidthAllocated = [&]() -> Idx const
+    {
+        // Calculate pitch: The size of one line in bytes including padding.
+        auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
+        return static_cast<Idx>(rowPitchFilter / sizeof(DataType));
+    }();
+
+    //  Construct kernel object, choose on of the kernels provided. ConvolutionKernel2DGlobalMemory and
+    //  ConvolutionKernel2DSharedMemory
+    ConvolutionKernel2DSharedMemory convolutionKernel2D;
+
+    alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};
+
+    //   Let alpaka calculate good block and grid sizes given our full problem extent.
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        convolutionKernel2D,
+        std::data(bufInputAcc),
+        std::data(outputDeviceMemory),
+        matrixWidth,
+        matrixHeight,
+        std::data(bufFilterAcc),
+        filterWidth,
+        intputWidthAllocated,
+        filterWidthAllocated);
+
+    // Run the kernel
+    alpaka::exec<DevAcc>(
+        queueAcc,
+        workDiv,
+        convolutionKernel2D,
+        std::data(bufInputAcc),
+        std::data(outputDeviceMemory),
+        matrixWidth,
+        matrixHeight,
+        std::data(bufFilterAcc),
+        filterWidth,
+        intputWidthAllocated,
+        filterWidthAllocated);
+
+    // Allocate memory on host to receive the resulting matrix as an array
+    auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, extent1D);
+    // Copy result from device memory to host
+    alpaka::memcpy(queueAcc, resultGpuHost, outputDeviceMemory, extent1D);
+    alpaka::wait(queueAcc);
+
+    //  Print results
+    //
+    std::string const kernelType{
+        std::is_same<decltype(convolutionKernel2D), ConvolutionKernel2DGlobalMemory>::value
+            ? "ConvolutionKernel2DGlobalMemory"
+            : "ConvolutionKernel2DSharedMemory"};
+
+    std::cout << "Convolution filter kernel:" << kernelType << "\n";
+    std::cout << "Matrix Size:" << matrixWidth << "x" << matrixHeight << ", Filter Size:" << filterWidth << "x"
+              << filterWidth << "\n";
+
+    // Print output
+    for(size_t i{0}; i < matrixWidth * matrixHeight; ++i)
+    {
+        std::cout << "output[" << i << "]:" << std::setprecision(6) << resultGpuHost[i] << std::endl;
+    }
+
+    // Expected array of sampled results
+    std::vector<DataType> const expectedOutput{
+        4.622344e+00,
+        1.106426e+02,
+        2.162168e+02,
+        3.217910e+02,
+        4.273652e+02,
+        4.199258e+02,
+        6.385137e+02,
+        7.440879e+02,
+        8.496621e+02,
+        9.552363e+02,
+        4.390715e+02};
+    // Select samples from output to check results
+    size_t const numberOfSamples{10};
+    size_t const samplePeriod{matrixWidth * matrixHeight / numberOfSamples};
+    bool allEqual{true};
+    for(size_t i{0}; i < numberOfSamples; ++i)
+    {
+        // Compare with the reference results, select one from every samplePeriod element
+        bool fuzzyEqual = FuzzyEqual(resultGpuHost[i * samplePeriod], expectedOutput[i]);
+        if(!fuzzyEqual)
+            std::cout << resultGpuHost[i * samplePeriod] << " " << expectedOutput[i] << std::endl;
+        allEqual = allEqual && fuzzyEqual;
+    }
+    if(!allEqual)
+    {
+        std::cout << "Error: Some 2D convolution results doesn't match!\n";
+        return EXIT_FAILURE;
+    }
+    std::cout << "Sampled result checks are correct!\n";
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/counterBasedRng/src/counterBasedRng.cpp b/alpaka/example/counterBasedRng/src/counterBasedRng.cpp
index ad86b144..d96ab2b7 100644
--- a/alpaka/example/counterBasedRng/src/counterBasedRng.cpp
+++ b/alpaka/example/counterBasedRng/src/counterBasedRng.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 #include <alpaka/rand/RandPhiloxStateless.hpp>
 
 #include <chrono>
@@ -17,12 +17,9 @@ class CounterBasedRngKernel
 public:
     template<class TAcc>
     using Vec = alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>;
-    template<class TAcc>
-    using Gen = typename alpaka::rand::PhiloxStateless4x32x10Vector<TAcc>;
-    template<class TAcc>
-    using Key = typename Gen<TAcc>::Key;
-    template<class TAcc>
-    using Counter = typename Gen<TAcc>::Counter;
+    using Gen = typename alpaka::rand::PhiloxStateless4x32x10Vector;
+    using Key = typename Gen::Key;
+    using Counter = typename Gen::Counter;
 
     template<typename TAcc, typename TElem>
     using Mdspan = alpaka::experimental::MdSpan<TElem, alpaka::Idx<TAcc>, alpaka::Dim<TAcc>>;
@@ -36,7 +33,7 @@ class CounterBasedRngKernel
         static ALPAKA_FN_ACC auto elemLoop(
             TAcc const& acc,
             Mdspan<TAcc, TElem> dst,
-            Key<TAcc> const& key,
+            Key const& key,
             Vec<TAcc> const& threadElemExtent,
             Vec<TAcc>& threadFirstElemIdx) -> void
         {
@@ -56,14 +53,14 @@ class CounterBasedRngKernel
             }
             else
             {
-                Counter<TAcc> c = {0, 0, 0, 0};
+                Counter c = {0, 0, 0, 0};
                 for(unsigned int i = 0; i < Dim; ++i)
                     c[i] = threadFirstElemIdx[i];
 
                 for(; threadFirstElemIdx[Dim - 1] < threadLastElemIdxClipped; ++threadFirstElemIdx[Dim - 1])
                 {
                     c[Dim - 1] = threadFirstElemIdx[Dim - 1];
-                    auto const random = Gen<TAcc>::generate(c, key);
+                    auto const random = Gen::generate(c, key);
                     // to make use of the whole random vector we would need to ensure numElement[0] % 4 == 0
                     dst(alpaka::toArray(threadFirstElemIdx)) = TElem(random[0]);
                 }
@@ -82,7 +79,7 @@ class CounterBasedRngKernel
     //! \param extent The matrix dimension in elements.
     ALPAKA_NO_HOST_ACC_WARNING
     template<typename TAcc, typename TElem>
-    ALPAKA_FN_ACC auto operator()(TAcc const& acc, Mdspan<TAcc, TElem> dst, Key<TAcc> const& key) const -> void
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, Mdspan<TAcc, TElem> dst, Key const& key) const -> void
     {
         constexpr auto Dim = alpaka::Dim<TAcc>::value;
         static_assert(Dim <= 4, "The CounterBasedRngKernel expects at most 4-dimensional indices!");
@@ -95,30 +92,19 @@ class CounterBasedRngKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-
     // Define the index domain
     using Dim = alpaka::DimInt<3u>;
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     using AccHost = alpaka::AccCpuSerial<Dim, Idx>;
@@ -145,19 +131,6 @@ auto main() -> int
     alpaka::Vec<Dim, Idx> const elementsPerThread = {1, 1, 1};
     alpaka::Vec<Dim, Idx> const elementsPerThreadHost = {1, 1, 8};
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    alpaka::WorkDivMembers<Dim, Idx> const workDivAcc(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
-    alpaka::WorkDivMembers<Dim, Idx> const workDivHost(alpaka::getValidWorkDiv<AccHost>(
-        devHost,
-        extent,
-        elementsPerThreadHost,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
 
     // Define the buffer element type
     using Data = std::uint32_t;
@@ -166,23 +139,39 @@ auto main() -> int
     auto bufHost(alpaka::allocBuf<Data, Idx>(devHost, extent));
     auto bufHostDev(alpaka::allocBuf<Data, Idx>(devHost, extent));
 
-    // Initialize the host input vectors A and B
-    Data* const pBufHost(alpaka::getPtrNative(bufHost));
-    Data* const pBufHostDev(alpaka::getPtrNative(bufHostDev));
-
     std::random_device rd{};
-    CounterBasedRngKernel::Key<AccHost> key = {rd(), rd()};
+    CounterBasedRngKernel::Key key = {rd(), rd()};
 
     // Allocate buffer on the accelerator
     using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
     BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));
 
+    CounterBasedRngKernel counterBasedRngKernel;
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> kernerlCfgAccDev = {extent, elementsPerThread};
+    auto const workDivAcc = alpaka::getValidWorkDiv(
+        kernerlCfgAccDev,
+        devAcc,
+        counterBasedRngKernel,
+        alpaka::experimental::getMdSpan(bufAcc),
+        key);
+
     // Create the kernel execution task.
     auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
         workDivAcc,
         CounterBasedRngKernel(),
         alpaka::experimental::getMdSpan(bufAcc),
         key);
+
+    alpaka::KernelCfg<AccHost> kernerlCfgAccHost = {extent, elementsPerThreadHost};
+    auto const workDivHost = alpaka::getValidWorkDiv(
+        kernerlCfgAccHost,
+        devHost,
+        counterBasedRngKernel,
+        alpaka::experimental::getMdSpan(bufHost),
+        key);
+
     auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
         workDivHost,
         CounterBasedRngKernel(),
@@ -233,5 +222,20 @@ auto main() -> int
                   << "Execution results incorrect!" << std::endl;
         return EXIT_FAILURE;
     }
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/heatEquation/src/heatEquation.cpp b/alpaka/example/heatEquation/src/heatEquation.cpp
index 82cd5a86..a13b3f00 100644
--- a/alpaka/example/heatEquation/src/heatEquation.cpp
+++ b/alpaka/example/heatEquation/src/heatEquation.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <algorithm>
 #include <cmath>
@@ -62,11 +62,14 @@ auto exactSolution(double const x, double const t) -> double
 //! Every time step the kernel will be executed numNodesX-times
 //! After every step the curr-buffer will be set to the calculated values
 //! from the next-buffer.
-auto main() -> int
+//!
+//! In standard projects, you typically do not execute the code with any available accelerator.
+//! Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+//! selected accelerator only. If you use the example as the starting point for your project, you can rename the
+//! example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
     // Parameters (a user is supposed to change numNodesX, numTimeSteps)
     uint32_t const numNodesX = 1000;
     uint32_t const numTimeSteps = 10000;
@@ -87,9 +90,8 @@ auto main() -> int
     using Dim = alpaka::DimInt<1u>;
     using Idx = uint32_t;
 
-    // Select accelerator-types for host and device
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    // Define the accelerator
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Select specific devices
@@ -101,13 +103,6 @@ auto main() -> int
     // Get valid workdiv for the given problem
     uint32_t elemPerThread = 1;
     alpaka::Vec<Dim, Idx> const extent{numNodesX};
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-    auto workdiv = WorkDiv{alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        elemPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
 
     // Select queue
     using QueueProperty = alpaka::Blocking;
@@ -120,16 +115,16 @@ auto main() -> int
     // This buffer will hold the current values (used for the next step)
     auto uCurrBufHost = alpaka::allocBuf<double, Idx>(devHost, extent);
 
-    double* const pCurrHost = alpaka::getPtrNative(uCurrBufHost);
-    double* const pNextHost = alpaka::getPtrNative(uNextBufHost);
+    double* const pCurrHost = std::data(uCurrBufHost);
+    double* const pNextHost = std::data(uNextBufHost);
 
     // Accelerator buffer
     using BufAcc = alpaka::Buf<Acc, double, Dim, Idx>;
     auto uNextBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
     auto uCurrBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
 
-    double* pCurrAcc = alpaka::getPtrNative(uCurrBufAcc);
-    double* pNextAcc = alpaka::getPtrNative(uNextBufAcc);
+    double* pCurrAcc = std::data(uCurrBufAcc);
+    double* pNextAcc = std::data(uNextBufAcc);
 
     // Apply initial conditions for the test problem
     for(uint32_t i = 0; i < numNodesX; i++)
@@ -137,7 +132,13 @@ auto main() -> int
         pCurrHost[i] = exactSolution(i * dx, 0.0);
     }
 
-    HeatEquationKernel kernel;
+    HeatEquationKernel heatEqKernel;
+
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, elemPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
 
     // Copy host -> device
     alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
@@ -148,7 +149,7 @@ auto main() -> int
     for(uint32_t step = 0; step < numTimeSteps; step++)
     {
         // Compute next values
-        alpaka::exec<Acc>(queue, workdiv, kernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+        alpaka::exec<Acc>(queue, workDiv, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
 
         // We assume the boundary conditions are constant and so these values
         // do not need to be updated.
@@ -181,5 +182,20 @@ auto main() -> int
                   << std::endl;
         return EXIT_FAILURE;
     }
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/heatEquation2D/CMakeLists.txt b/alpaka/example/heatEquation2D/CMakeLists.txt
new file mode 100644
index 00000000..ab9c5fe6
--- /dev/null
+++ b/alpaka/example/heatEquation2D/CMakeLists.txt
@@ -0,0 +1,67 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME heatEquation2D)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+
+################################################################################
+# PNGwriter
+################################################################################
+
+# find PNGwriter installation
+find_package(PNGwriter 0.7.0 CONFIG)
+
+if(PNGwriter_FOUND)
+  set(PNGWRITER_ENABLED True)
+else()
+  set(PNGWRITER_ENABLED False)
+endif()
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/heatEquation2D.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+if(PNGwriter_FOUND)
+    target_link_libraries(
+        ${_TARGET_NAME}
+        PRIVATE PNGwriter::PNGwriter)
+    target_compile_definitions(${_TARGET_NAME} PRIVATE PNGWRITER_ENABLED)
+endif()
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/alpaka/example/heatEquation2D/src/BoundaryKernel.hpp b/alpaka/example/heatEquation2D/src/BoundaryKernel.hpp
new file mode 100644
index 00000000..e2a4d022
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/BoundaryKernel.hpp
@@ -0,0 +1,86 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include "analyticalSolution.hpp"
+#include "helpers.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+//! alpaka version of explicit finite-difference 1d heat equation solver
+//!
+//! Applies boundary conditions
+//! forward difference in t and second-order central difference in x
+//!
+//! \param uBuf grid values of u for each x, y and the current value of t:
+//!                 u(x, y, t)  | t = t_current
+//! \param chunkSize
+//! \param pitch
+//! \param dx step in x
+//! \param dy step in y
+//! \param dt step in t
+struct BoundaryKernel
+{
+    template<typename TAcc, typename TChunk>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        double* const uBuf,
+        TChunk const chunkSize,
+        TChunk const pitch,
+        uint32_t step,
+        double const dx,
+        double const dy,
+        double const dt) const -> void
+    {
+        using Dim = alpaka::DimInt<2u>;
+        using Idx = uint32_t;
+
+        // Get extents(dimensions)
+        auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        auto const numThreadsPerBlock = blockThreadExtent.prod();
+
+        // Get indexes
+        auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u];
+        auto const blockStartIdx = gridBlockIdx * chunkSize;
+
+        // Lambda function to apply boundary conditions
+        auto applyBoundary = [&](auto const& globalIdxStart, auto const length, bool isRow)
+        {
+            for(auto i = threadIdx1D; i < length; i += numThreadsPerBlock)
+            {
+                auto idx2D = globalIdxStart + (isRow ? alpaka::Vec<Dim, Idx>{0, i} : alpaka::Vec<Dim, Idx>{i, 0});
+                auto elem = getElementPtr(uBuf, idx2D, pitch);
+                *elem = exactSolution(idx2D[1] * dx, idx2D[0] * dy, step * dt);
+            }
+        };
+
+        // Apply boundary conditions for the top row
+        if(gridBlockIdx[0] == 0)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{0, 1}, chunkSize[1], true);
+        }
+
+        // Apply boundary conditions for the bottom row
+        if(gridBlockIdx[0] == gridBlockExtent[0] - 1)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{chunkSize[0] + 1, 1}, chunkSize[1], true);
+        }
+
+        // Apply boundary conditions for the left column
+        if(gridBlockIdx[1] == 0)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{1, 0}, chunkSize[0], false);
+        }
+
+        // Apply boundary conditions for the right column
+        if(gridBlockIdx[1] == gridBlockExtent[1] - 1)
+        {
+            applyBoundary(blockStartIdx + alpaka::Vec<Dim, Idx>{1, chunkSize[1] + 1}, chunkSize[0], false);
+        }
+    }
+};
diff --git a/alpaka/example/heatEquation2D/src/StencilKernel.hpp b/alpaka/example/heatEquation2D/src/StencilKernel.hpp
new file mode 100644
index 00000000..a6121d3d
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/StencilKernel.hpp
@@ -0,0 +1,89 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include "helpers.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+//! alpaka version of explicit finite-difference 2D heat equation solver
+//!
+//! \tparam T_SharedMemSize1D size of the shared memory box
+//!
+//! Solving equation u_t(x, t) = u_xx(x, t) + u_yy(y, t) using a simple explicit scheme with
+//! forward difference in t and second-order central difference in x and y
+//!
+//! \param uCurrBuf Current buffer with grid values of u for each x, y pair and the current value of t:
+//!                 u(x, y, t) | t = t_current
+//! \param uNextBuf resulting grid values of u for each x, y pair and the next value of t:
+//!              u(x, y, t) | t = t_current + dt
+//! \param chunkSize The size of the chunk or tile that the user divides the problem into. This defines the size of the
+//!                  workload handled by each thread block.
+//! \param pitchCurr The pitch (or stride) in memory corresponding to the TDim grid in the accelerator's memory.
+//!              This is used to calculate memory offsets when accessing elements in the current buffer.
+//! \param pitchNext The pitch used to calculate memory offsets when accessing elements in the next buffer.
+//! \param dx step in x
+//! \param dy step in y
+//! \param dt step in t
+template<size_t T_SharedMemSize1D>
+struct StencilKernel
+{
+    template<typename TAcc, typename TDim, typename TIdx>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        double const* const uCurrBuf,
+        double* const uNextBuf,
+        alpaka::Vec<TDim, TIdx> const chunkSize,
+        alpaka::Vec<TDim, TIdx> const pitchCurr,
+        alpaka::Vec<TDim, TIdx> const pitchNext,
+        double const dx,
+        double const dy,
+        double const dt) const -> void
+    {
+        auto& sdata = alpaka::declareSharedVar<double[T_SharedMemSize1D], __COUNTER__>(acc);
+
+        // Get extents(dimensions)
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        auto const numThreadsPerBlock = blockThreadExtent.prod();
+
+        // Get indexes
+        auto const gridBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdx1D = alpaka::mapIdx<1>(blockThreadIdx, blockThreadExtent)[0u];
+        auto const blockStartIdx = gridBlockIdx * chunkSize;
+
+        constexpr alpaka::Vec<TDim, TIdx> halo{2, 2};
+
+        for(auto i = threadIdx1D; i < T_SharedMemSize1D; i += numThreadsPerBlock)
+        {
+            auto idx2d = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize + halo);
+            idx2d = idx2d + blockStartIdx;
+            auto elem = getElementPtr(uCurrBuf, idx2d, pitchCurr);
+            sdata[i] = *elem;
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+        // Each kernel executes one element
+        double const rX = dt / (dx * dx);
+        double const rY = dt / (dy * dy);
+
+        // go over only core cells
+        for(auto i = threadIdx1D; i < chunkSize.prod(); i += numThreadsPerBlock)
+        {
+            auto idx2D = alpaka::mapIdx<2>(alpaka::Vec(i), chunkSize);
+            idx2D = idx2D + alpaka::Vec<TDim, TIdx>{1, 1}; // offset for halo above and to the left
+            auto localIdx1D = alpaka::mapIdx<1>(idx2D, chunkSize + halo)[0u];
+
+
+            auto bufIdx = idx2D + blockStartIdx;
+            auto elem = getElementPtr(uNextBuf, bufIdx, pitchNext);
+
+            *elem = sdata[localIdx1D] * (1.0 - 2.0 * rX - 2.0 * rY) + sdata[localIdx1D - 1] * rX
+                    + sdata[localIdx1D + 1] * rX + sdata[localIdx1D - chunkSize[1] - halo[1]] * rY
+                    + sdata[localIdx1D + chunkSize[1] + halo[1]] * rY;
+        }
+    }
+};
diff --git a/alpaka/example/heatEquation2D/src/analyticalSolution.hpp b/alpaka/example/heatEquation2D/src/analyticalSolution.hpp
new file mode 100644
index 00000000..953a5da8
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/analyticalSolution.hpp
@@ -0,0 +1,71 @@
+/* Copyright 2020 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#include <cmath>
+
+//! Exact solution to the test problem
+//! u_t(x, y, t) = u_xx(x, t) + u_yy(y, t), x in [0, 1], y in [0, 1], t in [0, T]
+//!
+//! \param x value of x
+//! \param x value of y
+//! \param t value of t
+ALPAKA_FN_HOST_ACC auto exactSolution(double const x, double const y, double const t) -> double
+{
+    constexpr double pi = alpaka::math::constants::pi;
+    return std::exp(-pi * pi * t) * (std::sin(pi * x) + std::sin(pi * y));
+}
+
+//! Valdidate calculated solution in the buffer to the analytical solution at t=tMax
+//!
+//! \param buffer buffer holding the solution at t=tMax
+//! \param extent extents of the buffer
+//! \param dx
+//! \param dy
+//! \param tMax time at simulation end
+template<typename T_Buffer, typename T_Extent>
+auto validateSolution(
+    T_Buffer const& buffer,
+    T_Extent const& extent,
+    double const dx,
+    double const dy,
+    double const tMax) -> std::pair<bool, double>
+{
+    // Calculate error
+    double maxError = 0.0;
+    for(uint32_t j = 1; j < extent[0] - 1; ++j)
+    {
+        for(uint32_t i = 1; i < extent[1] - 1; ++i)
+        {
+            auto const error = std::abs(buffer.data()[j * extent[1] + i] - exactSolution(i * dx, j * dy, tMax));
+            maxError = std::max(maxError, error);
+        }
+    }
+
+    constexpr double errorThreshold = 1e-4;
+    return std::make_pair(maxError < errorThreshold, maxError);
+}
+
+//! Initialize the buffer to the analytical solution at t=0
+//!
+//! \param buffer buffer holding the solution at tMax
+//! \param extent extents of the buffer
+//! \param dx
+//! \param dy
+template<typename TBuffer>
+auto initalizeBuffer(TBuffer& buffer, double const dx, double const dy) -> void
+{
+    auto extents = alpaka::getExtents(buffer);
+    // Apply initial conditions for the test problem
+    for(uint32_t j = 0; j < extents[0]; ++j)
+    {
+        for(uint32_t i = 0; i < extents[1]; ++i)
+        {
+            buffer.data()[j * extents[1] + i] = exactSolution(i * dx, j * dy, 0.0);
+        }
+    }
+}
diff --git a/alpaka/example/heatEquation2D/src/heatEquation2D.cpp b/alpaka/example/heatEquation2D/src/heatEquation2D.cpp
new file mode 100644
index 00000000..b186e9ec
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/heatEquation2D.cpp
@@ -0,0 +1,219 @@
+/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude, Sergei Bastrakov, Bernhard Manfred Gruber,
+ * Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#include "BoundaryKernel.hpp"
+#include "StencilKernel.hpp"
+#include "analyticalSolution.hpp"
+
+#ifdef PNGWRITER_ENABLED
+#    include "writeImage.hpp"
+#endif
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+//! Each kernel computes the next step for one point.
+//! Therefore the number of threads should be equal to numNodesX.
+//! Every time step the kernel will be executed numNodesX-times
+//! After every step the curr-buffer will be set to the calculated values
+//! from the next-buffer.
+//!
+//! In standard projects, you typically do not execute the code with any available accelerator.
+//! Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+//! selected accelerator only. If you use the example as the starting point for your project, you can rename the
+//! example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Set Dim and Idx type
+    using Dim = alpaka::DimInt<2u>;
+    using Idx = uint32_t;
+
+    // Define the accelerator
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
+
+    // Select specific devices
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    // get suitable device for this Acc
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    // simulation defines
+    // {Y, X}
+    constexpr alpaka::Vec<Dim, Idx> numNodes{64, 64};
+    constexpr alpaka::Vec<Dim, Idx> haloSize{2, 2};
+    constexpr alpaka::Vec<Dim, Idx> extent = numNodes + haloSize;
+
+    constexpr uint32_t numTimeSteps = 4000;
+    constexpr double tMax = 0.1;
+
+    // x, y in [0, 1], t in [0, tMax]
+    constexpr double dx = 1.0 / static_cast<double>(extent[1] - 1);
+    constexpr double dy = 1.0 / static_cast<double>(extent[0] - 1);
+    constexpr double dt = tMax / static_cast<double>(numTimeSteps);
+
+    // Check the stability condition
+    double r = 2 * dt / ((dx * dx * dy * dy) / (dx * dx + dy * dy));
+    if(r > 1.)
+    {
+        std::cerr << "Stability condition check failed: dt/min(dx^2,dy^2) = " << r
+                  << ", it is required to be <= 0.5\n";
+        return EXIT_FAILURE;
+    }
+
+    // Initialize host-buffer
+    // This buffer will hold the current values (used for the next step)
+    auto uBufHost = alpaka::allocBuf<double, Idx>(devHost, extent);
+
+    // Accelerator buffer
+    auto uCurrBufAcc = alpaka::allocBuf<double, Idx>(devAcc, extent);
+    auto uNextBufAcc = alpaka::allocBuf<double, Idx>(devAcc, extent);
+
+    auto const pitchCurrAcc{alpaka::getPitchesInBytes(uCurrBufAcc)};
+    auto const pitchNextAcc{alpaka::getPitchesInBytes(uNextBufAcc)};
+
+    // Set buffer to initial conditions
+    initalizeBuffer(uBufHost, dx, dy);
+
+    // Select queue
+    using QueueProperty = alpaka::NonBlocking;
+    using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
+    QueueAcc dumpQueue{devAcc};
+    QueueAcc computeQueue{devAcc};
+
+    // Copy host -> device
+    alpaka::memcpy(computeQueue, uCurrBufAcc, uBufHost);
+    alpaka::wait(computeQueue);
+
+    // Define a workdiv for the given problem
+    constexpr alpaka::Vec<Dim, Idx> elemPerThread{1, 1};
+
+    // Appropriate chunk size to split your problem for your Acc
+    constexpr Idx xSize = 16u;
+    constexpr Idx ySize = 16u;
+    constexpr Idx halo = 2u;
+    constexpr alpaka::Vec<Dim, Idx> chunkSize{ySize, xSize};
+    constexpr auto sharedMemSize = (ySize + halo) * (xSize + halo);
+
+    constexpr alpaka::Vec<Dim, Idx> numChunks{
+        alpaka::core::divCeil(numNodes[0], chunkSize[0]),
+        alpaka::core::divCeil(numNodes[1], chunkSize[1]),
+    };
+
+    assert(
+        numNodes[0] % chunkSize[0] == 0 && numNodes[1] % chunkSize[1] == 0
+        && "Domain must be divisible by chunk size");
+
+    StencilKernel<sharedMemSize> stencilKernel;
+    BoundaryKernel boundaryKernel;
+
+    // Get max threads that can be run in a block for this kernel
+    auto const kernelFunctionAttributes = alpaka::getFunctionAttributes<Acc>(
+        devAcc,
+        stencilKernel,
+        uCurrBufAcc.data(),
+        uNextBufAcc.data(),
+        chunkSize,
+        pitchCurrAcc,
+        pitchNextAcc,
+        dx,
+        dy,
+        dt);
+    auto const maxThreadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
+
+    auto const threadsPerBlock
+        = maxThreadsPerBlock < chunkSize.prod() ? alpaka::Vec<Dim, Idx>{maxThreadsPerBlock, 1} : chunkSize;
+
+    alpaka::WorkDivMembers<Dim, Idx> workDiv_manual{numChunks, threadsPerBlock, elemPerThread};
+
+    // Simulate
+    for(uint32_t step = 1; step <= numTimeSteps; ++step)
+    {
+        // Compute next values
+        alpaka::exec<Acc>(
+            computeQueue,
+            workDiv_manual,
+            stencilKernel,
+            uCurrBufAcc.data(),
+            uNextBufAcc.data(),
+            chunkSize,
+            pitchCurrAcc,
+            pitchNextAcc,
+            dx,
+            dy,
+            dt);
+
+        // Apply boundaries
+        alpaka::exec<Acc>(
+            computeQueue,
+            workDiv_manual,
+            boundaryKernel,
+            uNextBufAcc.data(),
+            chunkSize,
+            pitchNextAcc,
+            step,
+            dx,
+            dy,
+            dt);
+
+#ifdef PNGWRITER_ENABLED
+        if((step - 1) % 100 == 0)
+        {
+            alpaka::wait(computeQueue);
+            alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc);
+            alpaka::wait(dumpQueue);
+            writeImage(step - 1, uBufHost);
+        }
+#endif
+
+        // So we just swap next and curr (shallow copy)
+        std::swap(uNextBufAcc, uCurrBufAcc);
+    }
+
+    // Copy device -> host
+    alpaka::wait(computeQueue);
+    alpaka::memcpy(dumpQueue, uBufHost, uCurrBufAcc);
+    alpaka::wait(dumpQueue);
+
+    // Validate
+    auto const [resultIsCorrect, maxError] = validateSolution(uBufHost, extent, dx, dy, tMax);
+
+    if(resultIsCorrect)
+    {
+        std::cout << "Execution results correct!" << std::endl;
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        std::cout << "Execution results incorrect: Max error = " << maxError << " (the grid resolution may be too low)"
+                  << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/heatEquation2D/src/helpers.hpp b/alpaka/example/heatEquation2D/src/helpers.hpp
new file mode 100644
index 00000000..adac2dfe
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/helpers.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2020 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+template<typename T, typename U>
+using const_match = std::conditional_t<std::is_const_v<T>, U const, U>;
+
+//! Helper function to get a pointer to an element in a multidimensional buffer
+//!
+//! \tparam T type of the element
+//! \tparam TDim dimension of the buffer
+//! \tparam TIdx index type
+//! \param ptr pointer to the buffer
+//! \param idx index of the element
+//! \param pitch pitch of the buffer
+template<typename T, typename TDim, typename TIdx>
+ALPAKA_FN_ACC T* getElementPtr(T* ptr, alpaka::Vec<TDim, TIdx> idx, alpaka::Vec<TDim, TIdx> pitch)
+{
+    return reinterpret_cast<T*>(
+        reinterpret_cast<const_match<T, std::byte>*>(ptr) + idx[0] * pitch[0] + idx[1] * pitch[1]);
+}
diff --git a/alpaka/example/heatEquation2D/src/writeImage.hpp b/alpaka/example/heatEquation2D/src/writeImage.hpp
new file mode 100644
index 00000000..4b8b6fc5
--- /dev/null
+++ b/alpaka/example/heatEquation2D/src/writeImage.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2024 Tapish Narwal
+ * SPDX-License-Identifier: ISC
+ */
+
+#pragma once
+
+#include <alpaka/extent/Traits.hpp>
+
+#include <pngwriter.h>
+
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <sstream>
+
+//! Writes the buffer to a png file
+//!
+//! \param currentStep the current step of the simulation
+//! \param buffer the buffer to write to the file
+template<typename T_Buffer>
+auto writeImage(uint32_t const currentStep, T_Buffer const& buffer) -> void
+{
+    std::stringstream step;
+    step << std::setw(6) << std::setfill('0') << currentStep;
+    std::string filename("heat_" + step.str() + ".png");
+    auto extents = alpaka::getExtents(buffer);
+    pngwriter png{static_cast<int>(extents[1]), static_cast<int>(extents[0]), 0, filename.c_str()};
+    png.setcompressionlevel(9);
+
+    for(uint32_t y = 0; y < extents[0]; ++y)
+    {
+        for(uint32_t x = 0; x < extents[1]; ++x)
+        {
+            auto p = buffer.data()[y * extents[1] + x];
+            png.plot(
+                x + 1,
+                extents[0] - y,
+                2 * std::exp(std::sqrt(p)) / std::exp(std::sqrt(2)) - 1,
+                0.4,
+                2 - 2 * std::exp(std::sqrt(p)) / std::exp(std::sqrt(2)));
+        }
+    }
+    png.close();
+}
diff --git a/alpaka/example/helloWorld/src/helloWorld.cpp b/alpaka/example/helloWorld/src/helloWorld.cpp
index ee7dac46..e18df95a 100644
--- a/alpaka/example/helloWorld/src/helloWorld.cpp
+++ b/alpaka/example/helloWorld/src/helloWorld.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -47,12 +47,13 @@ struct HelloWorldKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
     // Define the index domain
     //
     // Depending on your type of problem, you have to define
@@ -63,25 +64,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
-    // Each accelerator has strengths and weaknesses. Therefore,
-    // they need to be chosen carefully depending on the actual
-    // use case. Furthermore, some accelerators only support a
-    // particular workdiv, but workdiv can also be generated
-    // automatically.
-
-    // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -143,14 +126,7 @@ auto main() -> int
     // vector processing unit.
     using Vec = alpaka::Vec<Dim, Idx>;
     auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
-    auto const threadsPerGrid = Vec{4, 2, 4};
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    auto const elementsPerGrid = Vec{4, 2, 4};
 
     // Instantiate the kernel function object
     //
@@ -159,6 +135,11 @@ auto main() -> int
     // argument. So a kernel can be a class or struct, a lambda, etc.
     HelloWorldKernel helloWorldKernel;
 
+    alpaka::KernelCfg<Acc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, helloWorldKernel);
+
     // Run the kernel
     //
     // To execute the kernel, you have to provide the
@@ -176,5 +157,20 @@ auto main() -> int
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp b/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
index 90296c39..143b9e7c 100644
--- a/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ b/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <functional>
 
@@ -39,7 +39,12 @@ void ALPAKA_FN_ACC hiWorldFunction(TAcc const& acc, size_t const nExclamationMar
     printf("\n");
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
 // It requires support for extended lambdas when using nvcc as CUDA compiler.
 // Requires sequential backend if CI is used
@@ -51,17 +56,7 @@ auto main() -> int
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -80,16 +75,10 @@ auto main() -> int
     // Define the work division
     using Vec = alpaka::Vec<Dim, Idx>;
     auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
-    auto const threadsPerGrid = Vec{4, 2, 4};
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    auto const elementsPerGrid = Vec{4, 2, 4};
 
-    const size_t nExclamationMarks = 10;
+
+    size_t const nExclamationMarks = 10;
 
     // Run "Hello World" kernel with a lambda function
     //
@@ -106,30 +95,34 @@ auto main() -> int
     // To define a fully generic kernel lambda, the type of acc must be
     // auto. The Nvidia nvcc does not support generic lambdas, so the
     // type is set to Acc.
-    alpaka::exec<Acc>(
-        queue,
-        workDiv,
-        [] ALPAKA_FN_ACC(Acc const& acc, size_t const nExclamationMarksAsArg) -> void
+
+    auto kernelLambda = [] ALPAKA_FN_ACC(Acc const& acc, size_t const nExclamationMarksAsArg) -> void
+    {
+        auto globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
+
+        printf(
+            "[z:%u, y:%u, x:%u][linear:%u] Hello world from a lambda",
+            static_cast<unsigned>(globalThreadIdx[0]),
+            static_cast<unsigned>(globalThreadIdx[1]),
+            static_cast<unsigned>(globalThreadIdx[2]),
+            static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
+
+        for(size_t i = 0; i < nExclamationMarksAsArg; ++i)
         {
-            auto globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-            auto linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
-
-            printf(
-                "[z:%u, y:%u, x:%u][linear:%u] Hello world from a lambda",
-                static_cast<unsigned>(globalThreadIdx[0]),
-                static_cast<unsigned>(globalThreadIdx[1]),
-                static_cast<unsigned>(globalThreadIdx[2]),
-                static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
-
-            for(size_t i = 0; i < nExclamationMarksAsArg; ++i)
-            {
-                printf("!");
-            }
-
-            printf("\n");
-        },
-        nExclamationMarks);
+            printf("!");
+        }
+
+        printf("\n");
+    };
+
+    alpaka::KernelCfg<Acc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernelLambda, nExclamationMarks);
+
+    alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
@@ -138,3 +131,19 @@ auto main() -> int
     return EXIT_SUCCESS;
 #endif
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/kernelSpecialization/src/kernelSpecialization.cpp b/alpaka/example/kernelSpecialization/src/kernelSpecialization.cpp
index 753f56e8..f33306ec 100644
--- a/alpaka/example/kernelSpecialization/src/kernelSpecialization.cpp
+++ b/alpaka/example/kernelSpecialization/src/kernelSpecialization.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ struct Kernel
     //!
     //! It will be called when no overload is a better match.
     template<typename TAcc>
-    ALPAKA_FN_ACC auto operator()(TAcc const& acc) const
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc) const -> void
     {
         // For simplicity assume 1d thread indexing
         auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
@@ -41,7 +41,7 @@ struct Kernel
     //! Overloading for other accelerators is similar, with another template name instead of AccGpuCudaRt.
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
     template<typename TDim, typename TIdx>
-    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc) const
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc) const -> void
     {
         // This overload is used when the kernel is run on the CUDA accelerator.
         // So inside we can use both alpaka and native CUDA directly.
@@ -53,26 +53,16 @@ struct Kernel
 #endif
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
     // For simplicity this examples always uses 1 dimensional indexing, and index type size_t
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -87,19 +77,34 @@ auto main() -> int
     Queue queue(devAcc);
 
     // Define the work division
-    std::size_t const threadsPerGrid = 16u;
+    std::size_t const elementsPerGrid = 16u;
     std::size_t const elementsPerThread = 1u;
-    auto const workDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    Kernel kernel;
+
+    alpaka::KernelCfg<Acc> const kernelCfg = {elementsPerGrid, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel);
 
     // Run the kernel
-    alpaka::exec<Acc>(queue, workDiv, Kernel{});
+    alpaka::exec<Acc>(queue, workDiv, kernel);
     alpaka::wait(queue);
 
     return EXIT_SUCCESS;
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/matrixMulWithMdspan/CMakeLists.txt b/alpaka/example/matrixMulWithMdspan/CMakeLists.txt
new file mode 100644
index 00000000..7fa21254
--- /dev/null
+++ b/alpaka/example/matrixMulWithMdspan/CMakeLists.txt
@@ -0,0 +1,53 @@
+#
+# Copyright 2023 Simeon Ehrig, Mehmet Yusufoglu
+# SPDX-License-Identifier: MPL-2.0
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.22)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME matrixMulMdSpan)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+
+if (alpaka_USE_MDSPAN STREQUAL "OFF")
+    message(STATUS "The matrixMulMdSpan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.")
+    return()
+endif ()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/matrixMulMdSpan.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/alpaka/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/alpaka/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
new file mode 100644
index 00000000..e34dcb2d
--- /dev/null
+++ b/alpaka/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -0,0 +1,207 @@
+/* Copyright 2024 Mehmet Yusufoglu, Simeon Ehrig, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/alpaka.hpp>
+// Needed for running example for all backends available; one by one
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
+
+#include <experimental/mdspan>
+#include <iostream>
+
+//! Matrix multiplication example by using mdspan data structure
+
+//! Some simple type traits for checking the types
+//! isMdspan simply checks if a type is of type std::experimental::mdspan or not
+//! Primary template for is_mdspan (defaults to false)
+template<typename T>
+struct IsMdspan : std::false_type
+{
+};
+
+//! Specialization for mdspan with four template arguments
+template<typename ElementType, typename Extents, typename LayoutPolicy, typename AccessorPolicy>
+struct IsMdspan<std::experimental::mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>> : std::true_type
+{
+};
+
+template<typename T>
+inline constexpr bool is_mdspan = IsMdspan<T>::value;
+
+// Index type
+using Idx = std::size_t;
+// Set data type
+using DataType = float;
+
+/**
+ * @brief Kernel for performing multiplication of two 2D matrices. Each element is computed by a different thread.
+ * MdSpan data structure is used to pass the data to and from the kernel.
+ */
+struct MatrixMulKernel
+{
+    //! \tparam TAcc Accelerator type
+    //! \tparam MdSpan The type of the multidimensional span (mdspan)
+    //! \param acc Accelerator
+    //! \param A First input matrix
+    //! \param B Second input matrix
+    //! \param C Output matrix where the result of A * B will be stored
+    //! \param K The shared dimension between A and B
+    template<typename TAcc, typename TMdSpan>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
+    {
+        // compile time checks
+        static_assert(is_mdspan<TMdSpan>, "The type TMdSpan should be an std mdspan");
+        static_assert(TMdSpan::rank() == 2);
+
+        // A is MxK and B is KxN
+        auto const K = static_cast<Idx>(A.extent(1));
+
+        auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
+
+        if(i < C.extent(0) && j < C.extent(1))
+        {
+            DataType sum = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                sum += A(i, k) * B(k, j);
+            }
+            C(i, j) = sum;
+        }
+    }
+};
+
+// initialize the matrix
+template<typename TMdSpan>
+inline void initializeMatrx(TMdSpan& span)
+{
+    auto const numColumns = span.extent(1);
+    for(Idx i = 0; i < span.extent(0); ++i)
+    {
+        for(Idx j = 0; j < numColumns; ++j)
+        {
+            // fill with some data
+            span(i, j) = static_cast<DataType>(i * numColumns + j);
+        }
+    }
+}
+
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
+{
+    // Set number of dimensions (i.e 2) as a type
+    using Dim = alpaka::DimInt<2>;
+
+    // Define matrix dimensions, A is MxK and B is KxN
+    Idx const M = 1024;
+    Idx const N = 512;
+    Idx const K = 1024;
+
+    // Define device and queue
+    using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    auto const platformHost = alpaka::PlatformCpu{};
+    auto const devHost = alpaka::getDevByIdx(platformHost, 0);
+    auto const platformAcc = alpaka::Platform<Acc>{};
+    auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+    Queue queue(devAcc);
+
+    // Define the 2D extents (dimensions)
+    Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
+    Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
+    Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));
+
+    // Allocate host memory
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);
+
+    // Create mdspan view for bufHostA and bufHostB using alpaka::experimental::getMdSpan to fill the host buffers
+    auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
+    auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);
+
+    // Initialize host matrices
+    initializeMatrx(mdHostA);
+    initializeMatrx(mdHostB);
+
+    // Allocate device memory
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);
+
+    // Copy data to device, use directly host buffers (not mdspans used to fill the data)
+    alpaka::memcpy(queue, bufDevA, bufHostA);
+    alpaka::memcpy(queue, bufDevB, bufHostB);
+    alpaka::wait(queue);
+
+    // Create mdspan views for device buffers using alpaka::experimental::getMdSpan
+    auto mdDevA = alpaka::experimental::getMdSpan(bufDevA);
+    auto mdDevB = alpaka::experimental::getMdSpan(bufDevB);
+    auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
+
+    MatrixMulKernel kernel;
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+    auto const workDiv = alpaka::getValidWorkDiv<Acc>(kernelCfg, devAcc, kernel, mdDevA, mdDevB, mdDevC);
+
+    // Execute the kernel
+    alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);
+
+    // Copy result back to host
+    alpaka::memcpy(queue, bufHostC, bufDevC);
+    alpaka::wait(queue);
+
+    // Verify the result
+    bool success = true;
+    auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            DataType expectedValue = 0.0f;
+            for(Idx k = 0; k < K; ++k)
+            {
+                expectedValue += mdHostA(i, k) * mdHostB(k, j);
+            }
+            if(mdHostC(i, j) != expectedValue)
+            {
+                success = false;
+                break;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N << " using mdspan "
+              << (success ? "succeeded" : "failed") << "!" << std::endl;
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp
index bbb9e3fc..b26cd2af 100644
--- a/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp
+++ b/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <cstdlib>
@@ -34,6 +34,7 @@ struct Kernel
     //! \param numPoints The total number of points to be calculated.
     //! \param globalCounter The sum of all local results.
     //! \param functor The function for which the integral is to be computed.
+    ALPAKA_NO_HOST_ACC_WARNING
     template<typename TAcc, typename TFunctor>
     ALPAKA_FN_ACC auto operator()(
         TAcc const& acc,
@@ -52,7 +53,7 @@ struct Kernel
             linearizedGlobalThreadIdx,
             0); // No specific subsequence start.
         // For simplicity the interval is fixed to [0.0,1.0].
-        auto dist(alpaka::rand::distribution::createUniformReal<float>(acc));
+        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);
 
         uint32_t localCount = 0;
         for(size_t i = linearizedGlobalThreadIdx; i < numPoints; i += globalThreadExtent.prod())
@@ -72,13 +73,18 @@ struct Kernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     // Defines and setup.
     using Dim = alpaka::DimInt<1>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Host = alpaka::DevCpu;
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -90,36 +96,34 @@ auto main() -> int
 
     using BufHost = alpaka::Buf<Host, uint32_t, Dim, Idx>;
     using BufAcc = alpaka::Buf<Acc, uint32_t, Dim, Idx>;
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
     // Problem parameter.
     constexpr size_t numPoints = 1'000'000u;
     constexpr size_t extent = 1u;
     constexpr size_t numThreads = 100u; // Kernel will decide numCalcPerThread.
     constexpr size_t numAlpakaElementsPerThread = 1;
-    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        Vec(numThreads),
-        Vec(numAlpakaElementsPerThread),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
 
     // Setup buffer.
     BufHost bufHost{alpaka::allocBuf<uint32_t, Idx>(devHost, extent)};
-    uint32_t* const ptrBufHost{alpaka::getPtrNative(bufHost)};
     BufAcc bufAcc{alpaka::allocBuf<uint32_t, Idx>(devAcc, extent)};
-    uint32_t* const ptrBufAcc{alpaka::getPtrNative(bufAcc)};
+    uint32_t* const ptrBufAcc{std::data(bufAcc)};
 
     // Initialize the global count to 0.
-    ptrBufHost[0] = 0.0f;
+    bufHost[0] = 0.0f;
     alpaka::memcpy(queue, bufAcc, bufHost);
 
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)};
     Kernel kernel;
-    alpaka::exec<Acc>(queue, workdiv, kernel, numPoints, ptrBufAcc, Function{});
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, numPoints, ptrBufAcc, Function{});
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
     alpaka::memcpy(queue, bufHost, bufAcc);
     alpaka::wait(queue);
 
     // Check the result.
-    uint32_t globalCount = *ptrBufHost;
+    uint32_t globalCount = bufHost[0];
 
     // Final result.
     float finalResult = globalCount / static_cast<float>(numPoints);
@@ -132,3 +136,19 @@ auto main() -> int
     std::cout << "error: " << error << "\n";
     return error > 0.001 ? EXIT_FAILURE : EXIT_SUCCESS;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/openMPSchedule/src/openMPSchedule.cpp b/alpaka/example/openMPSchedule/src/openMPSchedule.cpp
index 23a71eec..65abaa01 100644
--- a/alpaka/example/openMPSchedule/src/openMPSchedule.cpp
+++ b/alpaka/example/openMPSchedule/src/openMPSchedule.cpp
@@ -3,7 +3,6 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -105,18 +104,18 @@ auto main() -> int
     Queue queue(devAcc);
 
     // Define the work division
-    Idx const threadsPerGrid = 16u;
+    Idx const elementsPerGrid = 16u;
     Idx const elementsPerThread = 1u;
-    auto const workDiv = alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        threadsPerGrid,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+
+    OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> kernelCfg = {elementsPerGrid, elementsPerThread};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, openMPScheduleDefaultKernel);
 
     // Run the kernel setting no schedule explicitly.
     std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
-    alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleDefaultKernel{});
+    alpaka::exec<Acc>(queue, workDiv, openMPScheduleDefaultKernel);
     alpaka::wait(queue);
 
     // Run the kernel setting the schedule via a trait
diff --git a/alpaka/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp b/alpaka/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
index 6b9736ec..a7e05b36 100644
--- a/alpaka/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
+++ b/alpaka/example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 #include <typeinfo>
@@ -45,10 +45,9 @@ void testResult(TQueue& queue, TBufAcc& bufAcc)
     auto const byte(static_cast<uint8_t>(0u));
     alpaka::memset(queue, bufAcc, byte);
     // Test that all elements were processed
-    auto const* result = alpaka::getPtrNative(bufHost);
     bool testPassed = true;
     for(uint32_t i = 0u; i < n; i++)
-        testPassed = testPassed && (std::abs(result[i] - process(i)) < 1e-3);
+        testPassed = testPassed && (std::abs(bufHost[i] - process(i)) < 1e-3);
     std::cout << (testPassed ? "Test passed.\n" : "Test failed.\n");
 }
 
@@ -74,7 +73,7 @@ struct NaiveCudaStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // Cuf off threads that have nothing to do
         if(globalThreadIdx < n)
         {
@@ -115,7 +114,7 @@ void naiveCudaStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     std::cout << "\nNaive CUDA style processing - each thread processes one data point:\n";
     std::cout << "   " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
               << "alpaka element layer not used\n";
-    alpaka::exec<TAcc>(queue, workDiv, NaiveCudaStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
+    alpaka::exec<TAcc>(queue, workDiv, NaiveCudaStyleKernel{}, std::data(bufAcc), n);
     testResult(queue, bufAcc);
 }
 
@@ -141,8 +140,8 @@ struct GridStridedLoopKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         for(uint32_t dataDomainIdx = globalThreadIdx; dataDomainIdx < n; dataDomainIdx += globalThreadExtent)
         {
             auto const memoryIdx = dataDomainIdx;
@@ -178,7 +177,7 @@ void gridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     std::cout << "\nGrid strided loop processing - fixed number of threads and blocks:\n";
     std::cout << "   " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
               << "alpaka element layer not used\n";
-    alpaka::exec<TAcc>(queue, workDiv, GridStridedLoopKernel{}, alpaka::getPtrNative(bufAcc), n);
+    alpaka::exec<TAcc>(queue, workDiv, GridStridedLoopKernel{}, std::data(bufAcc), n);
     testResult(queue, bufAcc);
 }
 
@@ -206,9 +205,9 @@ struct ChunkedGridStridedLoopKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const numElements(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const numElements = alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u];
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // Additionally could split the loop into peeled and remainder
         for(uint32_t chunkStart = globalThreadIdx * numElements; chunkStart < n;
             chunkStart += globalThreadExtent * numElements)
@@ -253,7 +252,7 @@ void chunkedGridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     std::cout << "\nChunked grid strided loop processing - fixed number of threads and blocks:\n";
     std::cout << "   " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
               << elementsPerThread << " alpaka elements per thread\n";
-    alpaka::exec<TAcc>(queue, workDiv, ChunkedGridStridedLoopKernel{}, alpaka::getPtrNative(bufAcc), n);
+    alpaka::exec<TAcc>(queue, workDiv, ChunkedGridStridedLoopKernel{}, std::data(bufAcc), n);
     testResult(queue, bufAcc);
 }
 
@@ -278,8 +277,8 @@ struct NaiveOpenMPStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         auto const processPerThread = (n + globalThreadExtent - 1) / globalThreadExtent;
         for(uint32_t dataDomainIdx = globalThreadIdx * processPerThread;
             (dataDomainIdx < (globalThreadIdx + 1) * processPerThread) && (dataDomainIdx < n);
@@ -319,7 +318,7 @@ void naiveOpenMPStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     std::cout << "\nNaive OpenMP style processing - each thread processes a single consecutive range of elements:\n";
     std::cout << "   " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
               << "alpaka element layer not used\n";
-    alpaka::exec<TAcc>(queue, workDiv, NaiveOpenMPStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
+    alpaka::exec<TAcc>(queue, workDiv, NaiveOpenMPStyleKernel{}, std::data(bufAcc), n);
     testResult(queue, bufAcc);
 }
 
@@ -343,9 +342,9 @@ struct OpenMPSimdStyleKernel
     template<typename TAcc>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, uint32_t n) const
     {
-        auto const numElements(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const globalThreadExtent(alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const globalThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const numElements = alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u];
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
         // This is the number for naive OpenMP style
         auto const naiveProcessPerThread = (n + globalThreadExtent - 1) / globalThreadExtent;
         // Round up to multiple of numElements
@@ -397,33 +396,22 @@ void openMPSimdStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
     std::cout << "\nOpenMP SIMD style processing - each thread processes a single consecutive range of elements:\n";
     std::cout << "   " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
               << elementsPerThread << " alpaka elements per thread\n";
-    alpaka::exec<TAcc>(queue, workDiv, OpenMPSimdStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
+    alpaka::exec<TAcc>(queue, workDiv, OpenMPSimdStyleKernel{}, std::data(bufAcc), n);
     testResult(queue, bufAcc);
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-
     // Define the index domain, this example is only for 1d
     using Dim = alpaka::DimInt<1u>;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, uint32_t>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, uint32_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, uint32_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Select a device and create queue for it
@@ -442,5 +430,21 @@ auto main() -> int
     naiveOpenMPStyle<Acc>(devAcc, queue, bufAcc);
     openMPSimdStyle<Acc>(devAcc, queue, bufAcc);
 
-#endif
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/randomCells2D/src/randomCells2D.cpp b/alpaka/example/randomCells2D/src/randomCells2D.cpp
index 7b96f795..36bc1258 100644
--- a/alpaka/example/randomCells2D/src/randomCells2D.cpp
+++ b/alpaka/example/randomCells2D/src/randomCells2D.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <cstdlib>
@@ -15,16 +15,14 @@ constexpr unsigned NUM_X = 127;
 constexpr unsigned NUM_Y = 211;
 
 /// Selected PRNG engine for single-value operation
-template<typename TAcc>
-using RandomEngineSingle = alpaka::rand::Philox4x32x10<TAcc>;
+using RandomEngineSingle = alpaka::rand::Philox4x32x10;
 // using RandomEngineSingle = alpaka::rand::engine::uniform_cuda_hip::Xor;
 // using RandomEngineSingle = alpaka::rand::engine::cpu::MersenneTwister;
 // using RandomEngineSingle = alpaka::rand::engine::cpu::TinyMersenneTwister;
 
 
 /// Selected PRNG engine for vector operation
-template<typename TAcc>
-using RandomEngineVector = alpaka::rand::Philox4x32x10Vector<TAcc>;
+using RandomEngineVector = alpaka::rand::Philox4x32x10Vector;
 
 /** Get a  pointer to the correct location of `TElement array` taking pitch into account.
  *
@@ -71,7 +69,7 @@ struct RunTimestepKernelSingle
     ALPAKA_FN_ACC auto operator()(
         TAcc const& acc,
         TExtent const extent,
-        RandomEngineSingle<TAcc>* const states,
+        RandomEngineSingle* const states,
         float* const cells,
         std::size_t pitchRand,
         std::size_t pitchOut) const -> void
@@ -84,7 +82,7 @@ struct RunTimestepKernelSingle
             auto cellsOut = pitchedPointer2D(cells, pitchOut, idx);
 
             // Setup generator and distribution.
-            RandomEngineSingle<TAcc> engine(*statesOut);
+            RandomEngineSingle engine(*statesOut);
             alpaka::rand::UniformReal<float> dist;
 
             float sum = 0;
@@ -104,7 +102,7 @@ struct RunTimestepKernelVector
     ALPAKA_FN_ACC auto operator()(
         TAcc const& acc,
         TExtent const extent,
-        RandomEngineVector<TAcc>* const states,
+        RandomEngineVector* const states,
         float* const cells,
         std::size_t pitchRand,
         std::size_t pitchOut) const -> void
@@ -117,10 +115,10 @@ struct RunTimestepKernelVector
             auto cellsOut = pitchedPointer2D(cells, pitchOut, idx);
 
             // Setup generator and distribution.
-            RandomEngineVector<TAcc> engine(*statesOut); // Load the state of the random engine
+            RandomEngineVector engine(*statesOut); // Load the state of the random engine
             using DistributionResult =
-                typename RandomEngineVector<TAcc>::template ResultContainer<float>; // Container type which will store
-                                                                                    // the distribution results
+                typename RandomEngineVector::template ResultContainer<float>; // Container type which will store
+                                                                              // the distribution results
             constexpr unsigned resultVectorSize = std::tuple_size_v<DistributionResult>; // Size of the result vector
             alpaka::rand::UniformReal<DistributionResult> dist; // Vector-aware distribution function
 
@@ -143,12 +141,17 @@ struct RunTimestepKernelVector
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
     using Dim = alpaka::DimInt<2>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Host = alpaka::DevCpu;
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -160,53 +163,55 @@ auto main() -> int
 
     using BufHost = alpaka::Buf<Host, float, Dim, Idx>;
     using BufAcc = alpaka::Buf<Acc, float, Dim, Idx>;
-    using BufHostRand = alpaka::Buf<Host, RandomEngineSingle<Acc>, Dim, Idx>;
-    using BufAccRand = alpaka::Buf<Acc, RandomEngineSingle<Acc>, Dim, Idx>;
-    using BufHostRandVec = alpaka::Buf<Host, RandomEngineVector<Acc>, Dim, Idx>;
-    using BufAccRandVec = alpaka::Buf<Acc, RandomEngineVector<Acc>, Dim, Idx>;
-    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    using BufHostRand = alpaka::Buf<Host, RandomEngineSingle, Dim, Idx>;
+    using BufAccRand = alpaka::Buf<Acc, RandomEngineSingle, Dim, Idx>;
+    using BufHostRandVec = alpaka::Buf<Host, RandomEngineVector, Dim, Idx>;
+    using BufAccRandVec = alpaka::Buf<Acc, RandomEngineVector, Dim, Idx>;
 
     constexpr Idx numX = NUM_X;
     constexpr Idx numY = NUM_Y;
 
-    const Vec extent(numY, numX);
+    Vec const extent(numY, numX);
 
     constexpr Idx perThreadX = 1;
     constexpr Idx perThreadY = 1;
 
-    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        Vec(perThreadY, perThreadX),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
-
     // Setup buffer.
     BufHost bufHostS{alpaka::allocBuf<float, Idx>(devHost, extent)};
-    float* const ptrBufHostS{alpaka::getPtrNative(bufHostS)};
+    float* const ptrBufHostS{std::data(bufHostS)};
     BufAcc bufAccS{alpaka::allocBuf<float, Idx>(devAcc, extent)};
-    float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)};
+    float* const ptrBufAccS{std::data(bufAccS)};
 
     BufHost bufHostV{alpaka::allocBuf<float, Idx>(devHost, extent)};
-    float* const ptrBufHostV{alpaka::getPtrNative(bufHostV)};
+    float* const ptrBufHostV{std::data(bufHostV)};
     BufAcc bufAccV{alpaka::allocBuf<float, Idx>(devAcc, extent)};
-    float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)};
+    float* const ptrBufAccV{std::data(bufAccV)};
 
-    BufHostRand bufHostRandS{alpaka::allocBuf<RandomEngineSingle<Acc>, Idx>(devHost, extent)};
-    BufAccRand bufAccRandS{alpaka::allocBuf<RandomEngineSingle<Acc>, Idx>(devAcc, extent)};
-    RandomEngineSingle<Acc>* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)};
+    BufHostRand bufHostRandS{alpaka::allocBuf<RandomEngineSingle, Idx>(devHost, extent)};
+    BufAccRand bufAccRandS{alpaka::allocBuf<RandomEngineSingle, Idx>(devAcc, extent)};
+    RandomEngineSingle* const ptrBufAccRandS{std::data(bufAccRandS)};
 
-    BufHostRandVec bufHostRandV{alpaka::allocBuf<RandomEngineVector<Acc>, Idx>(devHost, extent)};
-    BufAccRandVec bufAccRandV{alpaka::allocBuf<RandomEngineVector<Acc>, Idx>(devAcc, extent)};
-    RandomEngineVector<Acc>* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)};
+    BufHostRandVec bufHostRandV{alpaka::allocBuf<RandomEngineVector, Idx>(devHost, extent)};
+    BufAccRandVec bufAccRandV{alpaka::allocBuf<RandomEngineVector, Idx>(devAcc, extent)};
+    RandomEngineVector* const ptrBufAccRandV{std::data(bufAccRandV)};
 
     InitRandomKernel initRandomKernel;
+
+
     auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[0];
-    alpaka::exec<Acc>(queue, workdiv, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
-    alpaka::wait(queue);
 
     auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0];
-    alpaka::exec<Acc>(queue, workdiv, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV);
+
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, Vec(perThreadY, perThreadX)};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDivInitRandom
+        = alpaka::getValidWorkDiv(kernelCfg, devAcc, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
+
+    alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
+    alpaka::wait(queue);
+
+    alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV);
     alpaka::wait(queue);
 
     auto pitchHostS = alpaka::getPitchesInBytes(bufHostS)[0];
@@ -224,9 +229,23 @@ auto main() -> int
     auto pitchBufAccS = alpaka::getPitchesInBytes(bufAccS)[0];
     alpaka::memcpy(queue, bufAccS, bufHostS);
     RunTimestepKernelSingle runTimestepKernelSingle;
+
+    alpaka::KernelCfg<Acc> const runtimeRandomKernelCfg = {extent, Vec(perThreadY, perThreadX)};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDivRuntimeStep = alpaka::getValidWorkDiv(
+        runtimeRandomKernelCfg,
+        devAcc,
+        runTimestepKernelSingle,
+        extent,
+        ptrBufAccRandS,
+        ptrBufAccS,
+        pitchBufAccRandS,
+        pitchBufAccS);
+
     alpaka::exec<Acc>(
         queue,
-        workdiv,
+        workDivRuntimeStep,
         runTimestepKernelSingle,
         extent,
         ptrBufAccRandS,
@@ -240,7 +259,7 @@ auto main() -> int
     RunTimestepKernelVector runTimestepKernelVector;
     alpaka::exec<Acc>(
         queue,
-        workdiv,
+        workDivRuntimeStep,
         runTimestepKernelVector,
         extent,
         ptrBufAccRandV,
@@ -287,3 +306,19 @@ auto main() -> int
         return 1;
     }
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/randomStrategies/src/randomStrategies.cpp b/alpaka/example/randomStrategies/src/randomStrategies.cpp
index 84d2b543..6a1940c8 100644
--- a/alpaka/example/randomStrategies/src/randomStrategies.cpp
+++ b/alpaka/example/randomStrategies/src/randomStrategies.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <chrono>
 #include <cstdint>
@@ -17,8 +17,7 @@ constexpr unsigned NUM_ROLLS = 2000; ///< Amount of random number "dice rolls" p
 
 /// Selected PRNG engine
 // Comment the current "using" line, and uncomment a different one to change the PRNG engine
-template<typename TAcc>
-using RandomEngine = alpaka::rand::Philox4x32x10<TAcc>;
+using RandomEngine = alpaka::rand::Philox4x32x10;
 
 // using RandomEngine = alpaka::rand::engine::cpu::MersenneTwister;
 // using RandomEngine = alpaka::rand::engine::cpu::TinyMersenneTwister;
@@ -26,13 +25,14 @@ using RandomEngine = alpaka::rand::Philox4x32x10<TAcc>;
 
 
 /// Parameters to set up the default accelerator, queue, and buffers
+template<typename TAccTag>
 struct Box
 {
     // accelerator, queue, and work division typedefs
     using Dim = alpaka::DimInt<1>;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using PlatformHost = alpaka::PlatformCpu;
     using Host = alpaka::Dev<PlatformHost>;
     using PlatformAcc = alpaka::Platform<Acc>;
@@ -45,11 +45,12 @@ struct Box
     QueueAcc queue; ///< default accelerator queue
 
     // buffers holding the PRNG states
-    using BufHostRand = alpaka::Buf<Host, RandomEngine<Acc>, Dim, Idx>;
-    using BufAccRand = alpaka::Buf<Acc, RandomEngine<Acc>, Dim, Idx>;
+    using BufHostRand = alpaka::Buf<Host, RandomEngine, Dim, Idx>;
+    using BufAccRand = alpaka::Buf<Acc, RandomEngine, Dim, Idx>;
 
     Vec const extentRand; ///< size of the buffer of PRNG states
-    WorkDiv workdivRand; ///< work division for PRNG buffer initialization
+    // WorkDiv workdivRand; ///< work division for PRNG buffer initialization // REMOVE THAT!!
+    // WorkDiv workdivResult; ///< work division of the result calculation // REMOVE THAT!!
     BufHostRand bufHostRand; ///< host side PRNG states buffer (can be used to check the state of the states)
     BufAccRand bufAccRand; ///< device side PRNG states buffer
 
@@ -58,28 +59,16 @@ struct Box
     using BufAcc = alpaka::Buf<Acc, float, Dim, Idx>;
 
     Vec const extentResult; ///< size of the results buffer
-    WorkDiv workdivResult; ///< work division of the result calculation
+
     BufHost bufHostResult; ///< host side results buffer
     BufAcc bufAccResult; ///< device side results buffer
 
     Box()
         : queue{alpaka::getDevByIdx(accPlatform, 0)}
         , extentRand{static_cast<Idx>(NUM_POINTS)} // One PRNG state per "point".
-        , workdivRand{alpaka::getValidWorkDiv<Acc>(
-              alpaka::getDevByIdx(accPlatform, 0),
-              extentRand,
-              Vec(Idx{1}),
-              false,
-              alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)}
-        , bufHostRand{alpaka::allocBuf<RandomEngine<Acc>, Idx>(alpaka::getDevByIdx(hostPlatform, 0), extentRand)}
-        , bufAccRand{alpaka::allocBuf<RandomEngine<Acc>, Idx>(alpaka::getDevByIdx(accPlatform, 0), extentRand)}
+        , bufHostRand{alpaka::allocBuf<RandomEngine, Idx>(alpaka::getDevByIdx(hostPlatform, 0), extentRand)}
+        , bufAccRand{alpaka::allocBuf<RandomEngine, Idx>(alpaka::getDevByIdx(accPlatform, 0), extentRand)}
         , extentResult{static_cast<Idx>((NUM_POINTS * NUM_ROLLS))} // Store all "rolls" for each "point"
-        , workdivResult{alpaka::getValidWorkDiv<Acc>(
-              alpaka::getDevByIdx(accPlatform, 0),
-              extentResult,
-              Vec(static_cast<Idx>(NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
-              false,
-              alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)}
         , bufHostResult{alpaka::allocBuf<float, Idx>(alpaka::getDevByIdx(hostPlatform, 0), extentResult)}
         , bufAccResult{alpaka::allocBuf<float, Idx>(alpaka::getDevByIdx(accPlatform, 0), extentResult)}
     {
@@ -167,7 +156,7 @@ struct FillKernel
     ALPAKA_FN_ACC auto operator()(
         TAcc const& acc, ///< current accelerator
         TExtent const extent, ///< size of the results buffer
-        RandomEngine<TAcc>* const states, ///< PRNG states buffer
+        RandomEngine* const states, ///< PRNG states buffer
         float* const cells ///< results buffer
     ) const -> void
     {
@@ -180,7 +169,7 @@ struct FillKernel
             auto const numWorkers
                 = alpaka::math::min(acc, numGridThreads, static_cast<decltype(numGridThreads)>(NUM_POINTS));
 
-            RandomEngine<TAcc> engine(states[idx]); // Setup the PRNG using the saved state for this thread.
+            RandomEngine engine(states[idx]); // Setup the PRNG using the saved state for this thread.
             alpaka::rand::UniformReal<float> dist; // Setup the random number distribution
             for(uint32_t i = idx; i < extent[0]; i += numWorkers)
             {
@@ -195,13 +184,14 @@ struct FillKernel
  *
  *  File is in TSV format. One line for each "point"; line length is the number of "rolls".
  */
-void saveDataAndShowAverage(std::string filename, float const* buffer, Box const& box)
+template<typename TAccTag>
+void saveDataAndShowAverage(std::string filename, float const* buffer, Box<TAccTag> const& box)
 {
     std::ofstream output(filename);
     std::cout << "Writing " << filename << " ... " << std::flush;
     auto const lineLength = box.extentResult[0] / box.extentRand[0];
     double average = 0;
-    for(Box::Idx i = 0; i < box.extentResult[0]; ++i)
+    for(typename Box<TAccTag>::Idx i = 0; i < box.extentResult[0]; ++i)
     {
         output << buffer[i] << ((i + 1) % lineLength ? "\t" : "\n");
         average += buffer[i];
@@ -217,7 +207,8 @@ struct Writer;
 template<>
 struct Writer<Strategy::seed>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_seed.csv", buffer, box);
     }
@@ -226,7 +217,8 @@ struct Writer<Strategy::seed>
 template<>
 struct Writer<Strategy::subsequence>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_subsequence.csv", buffer, box);
     }
@@ -235,17 +227,18 @@ struct Writer<Strategy::subsequence>
 template<>
 struct Writer<Strategy::offset>
 {
-    static void save(float const* buffer, Box const& box)
+    template<typename TAccTag>
+    static void save(float const* buffer, Box<TAccTag> const& box)
     {
         saveDataAndShowAverage("out_offset.csv", buffer, box);
     }
 };
 
-template<Strategy TStrategy>
-void runStrategy(Box& box)
+template<Strategy TStrategy, typename TAccTag>
+void runStrategy(Box<TAccTag>& box)
 {
     // Set up the pointer to the PRNG states buffer
-    RandomEngine<Box::Acc>* const ptrBufAccRand{alpaka::getPtrNative(box.bufAccRand)};
+    RandomEngine* const ptrBufAccRand{std::data(box.bufAccRand)};
 
     // Initialize the PRNG and its states on the device
     InitRandomKernel<TStrategy> initRandomKernel;
@@ -253,9 +246,26 @@ void runStrategy(Box& box)
     // of the PRNG buffer and has to be passed in explicitly. Other strategies ignore the last parameter, and deduce
     // the initial parameters solely from the thread index
 
-    alpaka::exec<Box::Acc>(
+
+    alpaka::KernelCfg<typename Box<TAccTag>::Acc> kernelCfg
+        = {box.extentRand,
+           typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
+           false,
+           alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDivRand = alpaka::getValidWorkDiv(
+        kernelCfg,
+        alpaka::getDevByIdx(box.accPlatform, 0),
+        initRandomKernel,
+        box.extentRand,
+        ptrBufAccRand,
+        static_cast<unsigned>(box.extentResult[0] / box.extentRand[0]));
+
+
+    alpaka::exec<typename Box<TAccTag>::Acc>(
         box.queue,
-        box.workdivRand,
+        workDivRand,
         initRandomKernel,
         box.extentRand,
         ptrBufAccRand,
@@ -265,22 +275,46 @@ void runStrategy(Box& box)
     alpaka::wait(box.queue);
 
     // OPTIONAL: copy the the initial states to host if you want to check them yourself
-    // alpaka_rand::Philox4x32x10<Box::Acc>* const ptrBufHostRand{alpaka::getPtrNative(box.bufHostRand)};
+    // alpaka_rand::Philox4x32x10<Box::Acc>* const ptrBufHostRand{std::data(box.bufHostRand)};
     // alpaka::memcpy(box.queue, box.bufHostRand, box.bufAccRand);
     // alpaka::wait(box.queue);
 
     // Set up the pointers to the results buffers
-    float* const ptrBufHostResult{alpaka::getPtrNative(box.bufHostResult)};
-    float* const ptrBufAccResult{alpaka::getPtrNative(box.bufAccResult)};
+    float* const ptrBufHostResult{std::data(box.bufHostResult)};
+    float* const ptrBufAccResult{std::data(box.bufAccResult)};
 
     // Initialise the results buffer to zero
-    for(Box::Idx i = 0; i < box.extentResult[0]; ++i)
+    for(typename Box<TAccTag>::Idx i = 0; i < box.extentResult[0]; ++i)
         ptrBufHostResult[i] = 0;
 
     // Run the "computation" kernel filling the results buffer with random numbers in parallel
     alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult);
     FillKernel fillKernel;
-    alpaka::exec<Box::Acc>(box.queue, box.workdivResult, fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
+
+    alpaka::KernelCfg<typename Box<TAccTag>::Acc> fillKernelCfg
+        = {box.extentResult,
+           typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
+               NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
+           false,
+           alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workdivResult = alpaka::getValidWorkDiv(
+        fillKernelCfg,
+        alpaka::getDevByIdx(box.accPlatform, 0),
+        fillKernel,
+        box.extentResult,
+        ptrBufAccRand,
+        ptrBufAccResult);
+
+
+    alpaka::exec<typename Box<TAccTag>::Acc>(
+        box.queue,
+        workdivResult,
+        fillKernel,
+        box.extentResult,
+        ptrBufAccRand,
+        ptrBufAccResult);
     alpaka::memcpy(box.queue, box.bufHostResult, box.bufAccResult);
     alpaka::wait(box.queue);
 
@@ -288,9 +322,14 @@ void runStrategy(Box& box)
     Writer<TStrategy>::save(ptrBufHostResult, box);
 }
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-    Box box; // Initialize the box
+    Box<TAccTag> box; // Initialize the box
 
     runStrategy<Strategy::seed>(box); // threads start from different seeds
     runStrategy<Strategy::subsequence>(box); // threads use different subsequences
@@ -298,3 +337,19 @@ auto main() -> int
 
     return 0;
 }
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
diff --git a/alpaka/example/reduce/src/reduce.cpp b/alpaka/example/reduce/src/reduce.cpp
index 15957650..2d5fe2c1 100644
--- a/alpaka/example/reduce/src/reduce.cpp
+++ b/alpaka/example/reduce/src/reduce.cpp
@@ -79,8 +79,8 @@ auto reduce(
     auto const taskKernelReduceMain = alpaka::createTaskKernel<Acc>(
         workDiv1,
         kernel1,
-        alpaka::getPtrNative(sourceDeviceMemory),
-        alpaka::getPtrNative(destinationDeviceMemory),
+        std::data(sourceDeviceMemory),
+        std::data(destinationDeviceMemory),
         n,
         func);
 
@@ -88,8 +88,8 @@ auto reduce(
     auto const taskKernelReduceLastBlock = alpaka::createTaskKernel<Acc>(
         workDiv2,
         kernel2,
-        alpaka::getPtrNative(destinationDeviceMemory),
-        alpaka::getPtrNative(destinationDeviceMemory),
+        std::data(destinationDeviceMemory),
+        std::data(destinationDeviceMemory),
         blockCount,
         func);
 
@@ -131,7 +131,7 @@ auto main() -> int
     // allocate memory
     auto hostMemory = alpaka::allocBuf<T, Idx>(devHost, n);
 
-    T* nativeHostMemory = alpaka::getPtrNative(hostMemory);
+    T* nativeHostMemory = std::data(hostMemory);
 
     // fill array with data
     for(uint64_t i = 0; i < n; i++)
diff --git a/alpaka/example/tagSpecialization/src/tagSpecialization.cpp b/alpaka/example/tagSpecialization/src/tagSpecialization.cpp
index 8fb0c180..dac94bba 100644
--- a/alpaka/example/tagSpecialization/src/tagSpecialization.cpp
+++ b/alpaka/example/tagSpecialization/src/tagSpecialization.cpp
@@ -3,7 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <iostream>
 
@@ -76,27 +76,16 @@ struct WrapperKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    //
     // For simplicity this examples always uses 1 dimensional indexing, and index type size_t
-    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Call the specialized functions
@@ -126,5 +115,20 @@ auto main() -> int
     alpaka::exec<Acc>(queue, workDiv, WrapperKernel{});
     alpaka::wait(queue);
     return EXIT_SUCCESS;
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/example/vectorAdd/src/vectorAdd.cpp b/alpaka/example/vectorAdd/src/vectorAdd.cpp
index 60e136af..91d7bc7b 100644
--- a/alpaka/example/vectorAdd/src/vectorAdd.cpp
+++ b/alpaka/example/vectorAdd/src/vectorAdd.cpp
@@ -1,10 +1,10 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina,
- *                Aurora Perego
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina,
+ *                Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: ISC
  */
 
 #include <alpaka/alpaka.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <chrono>
 #include <iostream>
@@ -35,48 +35,29 @@ class VectorAddKernel
     {
         static_assert(alpaka::Dim<TAcc>::value == 1, "The VectorAddKernel expects 1-dimensional indices!");
 
-        TIdx const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        TIdx const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
-
-        if(threadFirstElemIdx < numElements)
+        // The uniformElements range for loop takes care automatically of the blocks, threads and elements in the
+        // kernel launch grid.
+        for(auto i : alpaka::uniformElements(acc, numElements))
         {
-            // Calculate the number of elements to compute in this thread.
-            // The result is uniform for all but the last thread.
-            TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
-            TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
-
-            for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
-            {
-                C[i] = A[i] + B[i];
-            }
+            C[i] = A[i] + B[i];
         }
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with any available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-
     // Define the index domain
+    // Set the number of dimensions as an integral constant. Set to 1 for 1D.
     using Dim = alpaka::DimInt<1u>;
     using Idx = std::size_t;
 
     // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators:
-    // - AccGpuCudaRt
-    // - AccGpuHipRt
-    // - AccCpuThreads
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuTbbBlocks
-    // - AccCpuSerial
-    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using DevAcc = alpaka::Dev<Acc>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
@@ -98,14 +79,6 @@ auto main() -> int
     Idx const elementsPerThread(8u);
     alpaka::Vec<Dim, Idx> const extent(numElements);
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        elementsPerThread,
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
-
     // Define the buffer element type
     using Data = std::uint32_t;
 
@@ -120,11 +93,6 @@ auto main() -> int
     BufHost bufHostB(alpaka::allocBuf<Data, Idx>(devHost, extent));
     BufHost bufHostC(alpaka::allocBuf<Data, Idx>(devHost, extent));
 
-    // Initialize the host input vectors A and B
-    Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
-    Data* const pBufHostB(alpaka::getPtrNative(bufHostB));
-    Data* const pBufHostC(alpaka::getPtrNative(bufHostC));
-
     // C++14 random generator for uniformly distributed numbers in {1,..,42}
     std::random_device rd{};
     std::default_random_engine eng{rd()};
@@ -132,9 +100,9 @@ auto main() -> int
 
     for(Idx i(0); i < numElements; ++i)
     {
-        pBufHostA[i] = dist(eng);
-        pBufHostB[i] = dist(eng);
-        pBufHostC[i] = 0;
+        bufHostA[i] = dist(eng);
+        bufHostB[i] = dist(eng);
+        bufHostC[i] = 0;
     }
 
     // Allocate 3 buffers on the accelerator
@@ -151,15 +119,27 @@ auto main() -> int
     // Instantiate the kernel function object
     VectorAddKernel kernel;
 
-    // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(
-        workDiv,
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, elementsPerThread};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
         kernel,
         alpaka::getPtrNative(bufAccA),
         alpaka::getPtrNative(bufAccB),
         alpaka::getPtrNative(bufAccC),
         numElements);
 
+    // Create the kernel execution task.
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(
+        workDiv,
+        kernel,
+        std::data(bufAccA),
+        std::data(bufAccB),
+        std::data(bufAccC),
+        numElements);
+
     // Enqueue the kernel execution task
     {
         auto const beginT = std::chrono::high_resolution_clock::now();
@@ -184,8 +164,8 @@ auto main() -> int
     static constexpr int MAX_PRINT_FALSE_RESULTS = 20;
     for(Idx i(0u); i < numElements; ++i)
     {
-        Data const& val(pBufHostC[i]);
-        Data const correctResult(pBufHostA[i] + pBufHostB[i]);
+        Data const& val(bufHostC[i]);
+        Data const correctResult(bufHostA[i] + bufHostB[i]);
         if(val != correctResult)
         {
             if(falseResults < MAX_PRINT_FALSE_RESULTS)
@@ -206,5 +186,20 @@ auto main() -> int
                   << "Execution results incorrect!" << std::endl;
         return EXIT_FAILURE;
     }
-#endif
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }
diff --git a/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
index 3ff3fcca..27661f58 100644
--- a/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -32,6 +32,7 @@
 
 // Implementation details.
 #include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ClipCast.hpp"
 #include "alpaka/core/Concepts.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 
@@ -116,14 +117,26 @@ namespace alpaka
             using type = AccCpuOmp2Blocks<TDim, TIdx>;
         };
 
+        //! The CPU OpenMP 2.0 block single thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The CPU OpenMP 2.0 block multi thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::false_type
+        {
+        };
+
         //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
         template<typename TDim, typename TIdx>
         struct GetAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& /* dev */) -> alpaka::AccDevProps<TDim, TIdx>
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
             {
                 return {// m_multiProcessorCount
-                        static_cast<TIdx>(1),
+                        alpaka::core::clipCast<TIdx>(omp_get_max_threads()),
                         // m_gridBlockExtentMax
                         Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
@@ -137,7 +150,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes())};
+                        static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
             }
         };
 
@@ -174,6 +189,13 @@ namespace alpaka
                 TKernelFnObj const& kernelFnObj,
                 TArgs&&... args)
             {
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuOmp2Blocks<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
                 return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
                     workDiv,
                     kernelFnObj,
diff --git a/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp b/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
index 4ff20540..bc326bc0 100644
--- a/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -121,6 +121,18 @@ namespace alpaka
             using type = AccCpuOmp2Threads<TDim, TIdx>;
         };
 
+        //! The CPU OpenMP 2.0 thread single thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The CPU OpenMP 2.0 thread multi thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::true_type
+        {
+        };
+
         //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
         template<typename TDim, typename TIdx>
         struct GetAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>
@@ -132,6 +144,7 @@ namespace alpaka
 #    else
                 auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(::omp_get_max_threads());
 #    endif
+                auto const memBytes = getMemBytes(dev);
                 return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
@@ -147,7 +160,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        getMemBytes(dev)};
+                        memBytes,
+                        // m_globalMemSizeBytes
+                        memBytes};
             }
         };
 
diff --git a/alpaka/include/alpaka/acc/AccCpuSerial.hpp b/alpaka/include/alpaka/acc/AccCpuSerial.hpp
index 2467a277..e1b223f8 100644
--- a/alpaka/include/alpaka/acc/AccCpuSerial.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuSerial.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -110,11 +110,23 @@ namespace alpaka
             using type = AccCpuSerial<TDim, TIdx>;
         };
 
+        //! The CPU serial single thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuSerial<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The CPU serial multi thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuSerial<TDim, TIdx>> : std::false_type
+        {
+        };
+
         //! The CPU serial accelerator device properties get trait specialization.
         template<typename TDim, typename TIdx>
         struct GetAccDevProps<AccCpuSerial<TDim, TIdx>>
         {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& /* dev */) -> AccDevProps<TDim, TIdx>
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
                 return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
@@ -131,7 +143,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes())};
+                        static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
             }
         };
 
@@ -168,6 +182,13 @@ namespace alpaka
                 TKernelFnObj const& kernelFnObj,
                 TArgs&&... args)
             {
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuSerial<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
                 return TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>(
                     workDiv,
                     kernelFnObj,
diff --git a/alpaka/include/alpaka/acc/AccCpuSycl.hpp b/alpaka/include/alpaka/acc/AccCpuSycl.hpp
index 7a2615fc..e4e73782 100644
--- a/alpaka/include/alpaka/acc/AccCpuSycl.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,19 +6,7 @@
 
 #include "alpaka/acc/AccGenericSycl.hpp"
 #include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
 #include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevCpuSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/PlatformCpuSycl.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <string>
-#include <utility>
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
@@ -28,65 +16,23 @@ namespace alpaka
     //!
     //! This accelerator allows parallel kernel execution on a oneAPI-capable CPU target device.
     template<typename TDim, typename TIdx>
-    class AccCpuSycl final
-        : public AccGenericSycl<TDim, TIdx>
-        , public concepts::Implements<ConceptAcc, AccCpuSycl<TDim, TIdx>>
-    {
-    public:
-        using AccGenericSycl<TDim, TIdx>::AccGenericSycl;
-    };
-} // namespace alpaka
+    using AccCpuSycl = AccGenericSycl<TagCpuSycl, TDim, TIdx>;
 
-namespace alpaka::trait
-{
-    //! The CPU SYCL accelerator name trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetAccName<AccCpuSycl<TDim, TIdx>>
+    namespace trait
     {
-        static auto getAccName() -> std::string
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
         {
-            return "AccCpuSycl<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-        }
-    };
+            using type = alpaka::TagCpuSycl;
+        };
 
-    //! The CPU SYCL accelerator device type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DevType<AccCpuSycl<TDim, TIdx>>
-    {
-        using type = DevCpuSycl;
-    };
-
-    //! The CPU SYCL accelerator execution task type trait specialization.
-    template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccCpuSycl<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-    {
-        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
         {
-            return TaskKernelCpuSycl<TDim, TIdx, TKernelFnObj, TArgs...>{
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...};
-        }
-    };
+            using type = alpaka::AccCpuSycl<TDim, TIdx>;
+        };
+    } // namespace trait
 
-    //! The CPU SYCL execution task platform type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct PlatformType<AccCpuSycl<TDim, TIdx>>
-    {
-        using type = PlatformCpuSycl;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
-    {
-        using type = alpaka::TagCpuSycl;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
-    {
-        using type = alpaka::AccCpuSycl<TDim, TIdx>;
-    };
-} // namespace alpaka::trait
+} // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp b/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
index c0d9bcb4..d283523e 100644
--- a/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera, Jan Stephan, Bernhard Manfred Gruber,
+ *                Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -31,6 +32,7 @@
 
 // Implementation details.
 #include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ClipCast.hpp"
 #include "alpaka/core/Concepts.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 
@@ -39,6 +41,8 @@
 
 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
 
+#    include <tbb/tbb.h>
+
 namespace alpaka
 {
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
@@ -107,14 +111,26 @@ namespace alpaka
             using type = AccCpuTbbBlocks<TDim, TIdx>;
         };
 
+        //! The CPU TBB block single thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The CPU TBB block multi thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::false_type
+        {
+        };
+
         //! The CPU TBB block accelerator device properties get trait specialization.
         template<typename TDim, typename TIdx>
         struct GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& /* dev */) -> AccDevProps<TDim, TIdx>
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
                 return {// m_multiProcessorCount
-                        static_cast<TIdx>(1),
+                        alpaka::core::clipCast<TIdx>(tbb::this_task_arena::max_concurrency()),
                         // m_gridBlockExtentMax
                         Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
@@ -128,7 +144,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes())};
+                        static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
             }
         };
 
@@ -165,6 +183,13 @@ namespace alpaka
                 TKernelFnObj const& kernelFnObj,
                 TArgs&&... args)
             {
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
                 return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
                     workDiv,
                     kernelFnObj,
diff --git a/alpaka/include/alpaka/acc/AccCpuThreads.hpp b/alpaka/include/alpaka/acc/AccCpuThreads.hpp
index 18b902e8..ce8f04a7 100644
--- a/alpaka/include/alpaka/acc/AccCpuThreads.hpp
+++ b/alpaka/include/alpaka/acc/AccCpuThreads.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -124,6 +124,18 @@ namespace alpaka
             using type = AccCpuThreads<TDim, TIdx>;
         };
 
+        //! The CPU threads single thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuThreads<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The CPU threads multi thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuThreads<TDim, TIdx>> : std::true_type
+        {
+        };
+
         //! The CPU threads accelerator device properties get trait specialization.
         template<typename TDim, typename TIdx>
         struct GetAccDevProps<AccCpuThreads<TDim, TIdx>>
@@ -131,7 +143,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
 #    ifdef ALPAKA_CI
-                auto const blockThreadCountMax(static_cast<TIdx>(8));
+                auto const blockThreadCountMax = static_cast<TIdx>(8);
 #    else
                 // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation
                 // defined maximum where the creation of a new thread crashes. std::thread::hardware_concurrency can
@@ -140,6 +152,7 @@ namespace alpaka
                     static_cast<TIdx>(1),
                     alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8));
 #    endif
+                auto const memBytes = getMemBytes(dev);
                 return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
@@ -155,7 +168,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        getMemBytes(dev)};
+                        memBytes,
+                        // m_globalMemSizeBytes
+                        memBytes};
             }
         };
 
diff --git a/alpaka/include/alpaka/acc/AccDevProps.hpp b/alpaka/include/alpaka/acc/AccDevProps.hpp
index cd87e20c..a199d542 100644
--- a/alpaka/include/alpaka/acc/AccDevProps.hpp
+++ b/alpaka/include/alpaka/acc/AccDevProps.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
+/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -7,9 +7,6 @@
 #include "alpaka/core/Common.hpp"
 #include "alpaka/vec/Vec.hpp"
 
-#include <string>
-#include <vector>
-
 namespace alpaka
 {
     //! The acceleration properties on a device.
@@ -23,37 +20,15 @@ namespace alpaka
             sizeof(TIdx) >= sizeof(int),
             "Index type is not supported, consider using int or a larger type.");
 
-        ALPAKA_FN_HOST AccDevProps(
-            TIdx const& multiProcessorCount,
-            Vec<TDim, TIdx> const& gridBlockExtentMax,
-            TIdx const& gridBlockCountMax,
-            Vec<TDim, TIdx> const& blockThreadExtentMax,
-            TIdx const& blockThreadCountMax,
-            Vec<TDim, TIdx> const& threadElemExtentMax,
-            TIdx const& threadElemCountMax,
-            size_t const& sharedMemSizeBytes)
-            : m_gridBlockExtentMax(gridBlockExtentMax)
-            , m_blockThreadExtentMax(blockThreadExtentMax)
-            , m_threadElemExtentMax(threadElemExtentMax)
-            , m_gridBlockCountMax(gridBlockCountMax)
-            , m_blockThreadCountMax(blockThreadCountMax)
-            , m_threadElemCountMax(threadElemCountMax)
-            , m_multiProcessorCount(multiProcessorCount)
-            , m_sharedMemSizeBytes(sharedMemSizeBytes)
-        {
-        }
-
-        // NOTE: The members have been reordered from the order in the constructor because gcc is buggy for some TDim
-        // and TIdx and generates invalid assembly.
+        // Please keep the order of data members so aggregate initialization does not break!
+        TIdx m_multiProcessorCount; //!< The number of multiprocessors.
         Vec<TDim, TIdx> m_gridBlockExtentMax; //!< The maximum number of blocks in each dimension of the grid.
-        Vec<TDim, TIdx> m_blockThreadExtentMax; //!< The maximum number of threads in each dimension of a block.
-        Vec<TDim, TIdx> m_threadElemExtentMax; //!< The maximum number of elements in each dimension of a thread.
-
         TIdx m_gridBlockCountMax; //!< The maximum number of blocks in a grid.
+        Vec<TDim, TIdx> m_blockThreadExtentMax; //!< The maximum number of threads in each dimension of a block.
         TIdx m_blockThreadCountMax; //!< The maximum number of threads in a block.
+        Vec<TDim, TIdx> m_threadElemExtentMax; //!< The maximum number of elements in each dimension of a thread.
         TIdx m_threadElemCountMax; //!< The maximum number of elements in a threads.
-
-        TIdx m_multiProcessorCount; //!< The number of multiprocessors.
         size_t m_sharedMemSizeBytes; //!< The size of shared memory per block
+        size_t m_globalMemSizeBytes; //!< The size of global memory
     };
 } // namespace alpaka
diff --git a/alpaka/include/alpaka/acc/AccFpgaSyclIntel.hpp b/alpaka/include/alpaka/acc/AccFpgaSyclIntel.hpp
index db4c0b94..d0e099f2 100644
--- a/alpaka/include/alpaka/acc/AccFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/acc/AccFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Aurora Perego
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,19 +6,7 @@
 
 #include "alpaka/acc/AccGenericSycl.hpp"
 #include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
 #include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <string>
-#include <utility>
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
@@ -28,65 +16,23 @@ namespace alpaka
     //!
     //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel FPGA target device.
     template<typename TDim, typename TIdx>
-    class AccFpgaSyclIntel final
-        : public AccGenericSycl<TDim, TIdx>
-        , public concepts::Implements<ConceptAcc, AccFpgaSyclIntel<TDim, TIdx>>
-    {
-    public:
-        using AccGenericSycl<TDim, TIdx>::AccGenericSycl;
-    };
-} // namespace alpaka
+    using AccFpgaSyclIntel = AccGenericSycl<TagFpgaSyclIntel, TDim, TIdx>;
 
-namespace alpaka::trait
-{
-    //! The Intel FPGA SYCL accelerator name trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetAccName<AccFpgaSyclIntel<TDim, TIdx>>
+    namespace trait
     {
-        static auto getAccName() -> std::string
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccFpgaSyclIntel<TDim, TIdx>>
         {
-            return "AccFpgaSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-        }
-    };
+            using type = alpaka::TagFpgaSyclIntel;
+        };
 
-    //! The Intel FPGA SYCL accelerator device type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DevType<AccFpgaSyclIntel<TDim, TIdx>>
-    {
-        using type = DevFpgaSyclIntel;
-    };
-
-    //! The Intel FPGA SYCL accelerator execution task type trait specialization.
-    template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccFpgaSyclIntel<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-    {
-        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagFpgaSyclIntel, TDim, TIdx>
         {
-            return TaskKernelFpgaSyclIntel<TDim, TIdx, TKernelFnObj, TArgs...>{
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...};
-        }
-    };
+            using type = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
+        };
+    } // namespace trait
 
-    //! The Intel FPGA SYCL execution task platform type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct PlatformType<AccFpgaSyclIntel<TDim, TIdx>>
-    {
-        using type = PlatformFpgaSyclIntel;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct AccToTag<alpaka::AccFpgaSyclIntel<TDim, TIdx>>
-    {
-        using type = alpaka::TagFpgaSyclIntel;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct TagToAcc<alpaka::TagFpgaSyclIntel, TDim, TIdx>
-    {
-        using type = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
-    };
-} // namespace alpaka::trait
+} // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/acc/AccGenericSycl.hpp b/alpaka/include/alpaka/acc/AccGenericSycl.hpp
index 84b238ce..46793445 100644
--- a/alpaka/include/alpaka/acc/AccGenericSycl.hpp
+++ b/alpaka/include/alpaka/acc/AccGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,11 +10,13 @@
 #include "alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp"
 #include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
 #include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
 #include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
 #include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
 #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
 #include "alpaka/math/MathGenericSycl.hpp"
 #include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
 #include "alpaka/rand/RandDefault.hpp"
 #include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/warp/WarpGenericSycl.hpp"
@@ -31,6 +33,7 @@
 // Implementation details.
 #include "alpaka/core/BoostPredef.hpp"
 #include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
 #include "alpaka/core/Sycl.hpp"
 
 #include <cstddef>
@@ -43,10 +46,13 @@
 
 namespace alpaka
 {
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGenericSycl;
+
     //! The SYCL accelerator.
     //!
     //! This accelerator allows parallel kernel execution on SYCL devices.
-    template<typename TDim, typename TIdx>
+    template<typename TTag, typename TDim, typename TIdx>
     class AccGenericSycl
         : public WorkDivGenericSycl<TDim, TIdx>
         , public gb::IdxGbGenericSycl<TDim, TIdx>
@@ -64,6 +70,7 @@ namespace alpaka
         , public rand::RandGenericSycl<TDim>
 #    endif
         , public warp::WarpGenericSycl<TDim>
+        , public concepts::Implements<ConceptAcc, AccGenericSycl<TTag, TDim, TIdx>>
     {
         static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero.");
 
@@ -96,19 +103,29 @@ namespace alpaka
 namespace alpaka::trait
 {
     //! The SYCL accelerator type trait specialization.
-    template<template<typename, typename> typename TAcc, typename TDim, typename TIdx>
-    struct AccType<TAcc<TDim, TIdx>, std::enable_if_t<std::is_base_of_v<AccGenericSycl<TDim, TIdx>, TAcc<TDim, TIdx>>>>
+    template<typename TTag, typename TDim, typename TIdx>
+    struct AccType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = AccGenericSycl<TTag, TDim, TIdx>;
+    };
+
+    //! The SYCL single thread accelerator type trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IsSingleThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::false_type
+    {
+    };
+
+    //! The SYCL multi thread accelerator type trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IsMultiThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::true_type
     {
-        using type = TAcc<TDim, TIdx>;
     };
 
     //! The SYCL accelerator device properties get trait specialization.
-    template<template<typename, typename> typename TAcc, typename TDim, typename TIdx>
-    struct GetAccDevProps<
-        TAcc<TDim, TIdx>,
-        std::enable_if_t<std::is_base_of_v<AccGenericSycl<TDim, TIdx>, TAcc<TDim, TIdx>>>>
+    template<typename TTag, typename TDim, typename TIdx>
+    struct GetAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>
     {
-        static auto getAccDevProps(typename DevType<TAcc<TDim, TIdx>>::type const& dev) -> AccDevProps<TDim, TIdx>
+        static auto getAccDevProps(DevGenericSycl<TTag> const& dev) -> AccDevProps<TDim, TIdx>
         {
             auto const device = dev.getNativeHandle().first;
             auto const max_threads_dim
@@ -135,20 +152,60 @@ namespace alpaka::trait
                     // m_threadElemCountMax
                     std::numeric_limits<TIdx>::max(),
                     // m_sharedMemSizeBytes
-                    device.template get_info<sycl::info::device::local_mem_size>()};
+                    device.template get_info<sycl::info::device::local_mem_size>(),
+                    // m_globalMemSizeBytes
+                    getMemBytes(dev)};
+        }
+    };
+
+    //! The SYCL accelerator name trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct GetAccName<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        static auto getAccName() -> std::string
+        {
+            return std::string("Acc") + core::demangled<TTag>.substr(__builtin_strlen("alpaka::Tag")) + "<"
+                   + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
         }
     };
 
+    //! The SYCL accelerator device type trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct DevType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = DevGenericSycl<TTag>;
+    };
+
     //! The SYCL accelerator dimension getter trait specialization.
-    template<template<typename, typename> typename TAcc, typename TDim, typename TIdx>
-    struct DimType<TAcc<TDim, TIdx>, std::enable_if_t<std::is_base_of_v<AccGenericSycl<TDim, TIdx>, TAcc<TDim, TIdx>>>>
+    template<typename TTag, typename TDim, typename TIdx>
+    struct DimType<AccGenericSycl<TTag, TDim, TIdx>>
     {
         using type = TDim;
     };
 
+    //! The SYCL accelerator execution task type trait specialization.
+    template<typename TTag, typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    struct CreateTaskKernel<AccGenericSycl<TTag, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+    {
+        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        {
+            return TaskKernelGenericSycl<TTag, AccGenericSycl<TTag, TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>{
+                workDiv,
+                kernelFnObj,
+                std::forward<TArgs>(args)...};
+        }
+    };
+
+    //! The SYCL execution task platform type trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct PlatformType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = PlatformGenericSycl<TTag>;
+    };
+
     //! The SYCL accelerator idx type trait specialization.
-    template<template<typename, typename> typename TAcc, typename TDim, typename TIdx>
-    struct IdxType<TAcc<TDim, TIdx>, std::enable_if_t<std::is_base_of_v<AccGenericSycl<TDim, TIdx>, TAcc<TDim, TIdx>>>>
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IdxType<AccGenericSycl<TTag, TDim, TIdx>>
     {
         using type = TIdx;
     };
diff --git a/alpaka/include/alpaka/acc/AccGpuSyclIntel.hpp b/alpaka/include/alpaka/acc/AccGpuSyclIntel.hpp
index bc60307e..2e75b436 100644
--- a/alpaka/include/alpaka/acc/AccGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/acc/AccGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,19 +6,7 @@
 
 #include "alpaka/acc/AccGenericSycl.hpp"
 #include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
 #include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/kernel/TaskKernelGpuSyclIntel.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <string>
-#include <utility>
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
@@ -28,65 +16,23 @@ namespace alpaka
     //!
     //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel GPU target device.
     template<typename TDim, typename TIdx>
-    class AccGpuSyclIntel final
-        : public AccGenericSycl<TDim, TIdx>
-        , public concepts::Implements<ConceptAcc, AccGpuSyclIntel<TDim, TIdx>>
-    {
-    public:
-        using AccGenericSycl<TDim, TIdx>::AccGenericSycl;
-    };
-} // namespace alpaka
+    using AccGpuSyclIntel = AccGenericSycl<TagGpuSyclIntel, TDim, TIdx>;
 
-namespace alpaka::trait
-{
-    //! The Intel GPU SYCL accelerator name trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetAccName<AccGpuSyclIntel<TDim, TIdx>>
+    namespace trait
     {
-        static auto getAccName() -> std::string
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccGpuSyclIntel<TDim, TIdx>>
         {
-            return "AccGpuSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-        }
-    };
+            using type = alpaka::TagGpuSyclIntel;
+        };
 
-    //! The Intel GPU SYCL accelerator device type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DevType<AccGpuSyclIntel<TDim, TIdx>>
-    {
-        using type = DevGpuSyclIntel;
-    };
-
-    //! The Intel GPU SYCL accelerator execution task type trait specialization.
-    template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccGpuSyclIntel<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-    {
-        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagGpuSyclIntel, TDim, TIdx>
         {
-            return TaskKernelGpuSyclIntel<TDim, TIdx, TKernelFnObj, TArgs...>{
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...};
-        }
-    };
+            using type = alpaka::AccGpuSyclIntel<TDim, TIdx>;
+        };
+    } // namespace trait
 
-    //! The Intel GPU SYCL execution task platform type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct PlatformType<AccGpuSyclIntel<TDim, TIdx>>
-    {
-        using type = PlatformGpuSyclIntel;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct AccToTag<alpaka::AccGpuSyclIntel<TDim, TIdx>>
-    {
-        using type = alpaka::TagGpuSyclIntel;
-    };
-
-    template<typename TDim, typename TIdx>
-    struct TagToAcc<alpaka::TagGpuSyclIntel, TDim, TIdx>
-    {
-        using type = alpaka::AccGpuSyclIntel<TDim, TIdx>;
-    };
-} // namespace alpaka::trait
+} // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
index a70d9d12..bc0e8cb6 100644
--- a/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
+++ b/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
+/* Copyright 2024 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -94,6 +94,18 @@ namespace alpaka
             using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
         };
 
+        //! The GPU CUDA single thread accelerator type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The GPU CUDA multi thread accelerator type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
+        {
+        };
+
         //! The GPU CUDA accelerator device properties get trait specialization.
         template<typename TApi, typename TDim, typename TIdx>
         struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
@@ -170,7 +182,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        static_cast<size_t>(sharedMemSizeBytes)};
+                        static_cast<size_t>(sharedMemSizeBytes),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
 
 #    else
                 typename TApi::DeviceProp_t properties;
@@ -197,7 +211,9 @@ namespace alpaka
                         // m_threadElemCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_sharedMemSizeBytes
-                        static_cast<size_t>(properties.sharedMemPerBlock)};
+                        static_cast<size_t>(properties.sharedMemPerBlock),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
 #    endif
             }
         };
diff --git a/alpaka/include/alpaka/acc/Tag.hpp b/alpaka/include/alpaka/acc/Tag.hpp
index 611ee558..f7880afd 100644
--- a/alpaka/include/alpaka/acc/Tag.hpp
+++ b/alpaka/include/alpaka/acc/Tag.hpp
@@ -41,18 +41,32 @@ namespace alpaka
         struct TagToAcc;
     } // namespace trait
 
-    /// @brief maps an acc type to a tag type
-    /// @tparam TAcc alpaka acc type
+    //! \brief maps an acc type to a tag type
+    //! \tparam TAcc alpaka acc type
     template<typename TAcc>
     using AccToTag = typename trait::AccToTag<TAcc>::type;
 
-    /// @brief maps a tag type to an acc type
-    /// @tparam TTag alpaka tag type
-    /// @tparam TDim dimension of the mapped acc type
-    /// @tparam TIdx index type of the mapped acc type
+    //! \brief maps a tag type to an acc type
+    //! \tparam TTag alpaka tag type
+    //! \tparam TDim dimension of the mapped acc type
+    //! \tparam TIdx index type of the mapped acc type
     template<typename TTag, typename TDim, typename TIdx>
     using TagToAcc = typename trait::TagToAcc<TTag, TDim, TIdx>::type;
 
     template<typename TAcc, typename... TTag>
     inline constexpr bool accMatchesTags = (std::is_same_v<alpaka::AccToTag<TAcc>, TTag> || ...);
+
+    //! list of all available tags
+    using AccTags = std::tuple<
+        alpaka::TagCpuSerial,
+        alpaka::TagCpuThreads,
+        alpaka::TagCpuTbbBlocks,
+        alpaka::TagCpuOmp2Blocks,
+        alpaka::TagCpuOmp2Threads,
+        alpaka::TagGpuCudaRt,
+        alpaka::TagGpuHipRt,
+        alpaka::TagCpuSycl,
+        alpaka::TagFpgaSyclIntel,
+        alpaka::TagGpuSyclIntel>;
+
 } // namespace alpaka
diff --git a/alpaka/include/alpaka/acc/TagAccIsEnabled.hpp b/alpaka/include/alpaka/acc/TagAccIsEnabled.hpp
new file mode 100644
index 00000000..c21fd2b1
--- /dev/null
+++ b/alpaka/include/alpaka/acc/TagAccIsEnabled.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+// include all Acc's because of the struct AccIsEnabled
+// if an acc is not include, it will be not enabled independent of the compiler flags
+#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
+#include "alpaka/acc/AccCpuOmp2Threads.hpp"
+#include "alpaka/acc/AccCpuSerial.hpp"
+#include "alpaka/acc/AccCpuSycl.hpp"
+#include "alpaka/acc/AccCpuTbbBlocks.hpp"
+#include "alpaka/acc/AccCpuThreads.hpp"
+#include "alpaka/acc/AccFpgaSyclIntel.hpp"
+#include "alpaka/acc/AccGpuCudaRt.hpp"
+#include "alpaka/acc/AccGpuHipRt.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/meta/Filter.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! \brief check if the accelerator is enabled for a given tag
+    //! \tparam TTag alpaka tag type
+    template<typename TTag, typename = void>
+    struct AccIsEnabled : std::false_type
+    {
+    };
+
+    template<typename TTag>
+    struct AccIsEnabled<TTag, std::void_t<TagToAcc<TTag, alpaka::DimInt<1>, int>>> : std::true_type
+    {
+    };
+
+    //! list of all tags where the related accelerator is enabled
+    using EnabledAccTags = alpaka::meta::Filter<AccTags, alpaka::AccIsEnabled>;
+
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/acc/Traits.hpp b/alpaka/include/alpaka/acc/Traits.hpp
index eb29e854..48fa0b18 100644
--- a/alpaka/include/alpaka/acc/Traits.hpp
+++ b/alpaka/include/alpaka/acc/Traits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -36,6 +36,26 @@ namespace alpaka
         template<typename T, typename TSfinae = void>
         struct AccType;
 
+        //! The single thread accelerator trait.
+        //!
+        //! If TAcc is an accelerator that supports only a single thread per block, inherit from std::true_type.
+        //! If TAcc is not an accelerator, or an accelerator that supports multiple threads per block, inherit from
+        //! std::false_type.
+        template<typename TAcc, typename TSfinae = void>
+        struct IsSingleThreadAcc : std::false_type
+        {
+        };
+
+        //! The multi thread accelerator trait.
+        //!
+        //! If TAcc is an accelerator that supports multiple threads per block, inherit from std::true_type.
+        //! If TAcc is not an accelerator, or an accelerator that supports only a single thread per block, inherit from
+        //! std::false_type.
+        template<typename TAcc, typename TSfinae = void>
+        struct IsMultiThreadAcc : std::false_type
+        {
+        };
+
         //! The device properties get trait.
         template<typename TAcc, typename TSfinae = void>
         struct GetAccDevProps;
@@ -57,6 +77,14 @@ namespace alpaka
     template<typename T>
     using Acc = typename trait::AccType<T>::type;
 
+    //! True if TAcc is an accelerator that supports only a single thread per block, false otherwise.
+    template<typename TAcc>
+    inline constexpr bool isSingleThreadAcc = trait::IsSingleThreadAcc<TAcc>::value;
+
+    //! True if TAcc is an accelerator that supports multiple threads per block, false otherwise.
+    template<typename TAcc>
+    inline constexpr bool isMultiThreadAcc = trait::IsMultiThreadAcc<TAcc>::value;
+
     //! \return The acceleration properties on the given device.
     template<typename TAcc, typename TDev>
     ALPAKA_FN_HOST auto getAccDevProps(TDev const& dev) -> AccDevProps<Dim<TAcc>, Idx<TAcc>>
@@ -81,5 +109,7 @@ namespace alpaka
         {
             using type = typename QueueType<typename alpaka::trait::PlatformType<TAcc>::type, TProperty>::type;
         };
+
     } // namespace trait
+
 } // namespace alpaka
diff --git a/alpaka/include/alpaka/alpaka.hpp b/alpaka/include/alpaka/alpaka.hpp
index 5ea58163..fe410cff 100644
--- a/alpaka/include/alpaka/alpaka.hpp
+++ b/alpaka/include/alpaka/alpaka.hpp
@@ -23,6 +23,7 @@
 #include "alpaka/acc/AccGpuHipRt.hpp"
 #include "alpaka/acc/AccGpuSyclIntel.hpp"
 #include "alpaka/acc/Tag.hpp"
+#include "alpaka/acc/TagAccIsEnabled.hpp"
 #include "alpaka/acc/Traits.hpp"
 // atomic
 #include "alpaka/atomic/AtomicCpu.hpp"
@@ -67,6 +68,7 @@
 #include "alpaka/core/OmpSchedule.hpp"
 #include "alpaka/core/Positioning.hpp"
 #include "alpaka/core/RemoveRestrict.hpp"
+#include "alpaka/core/RuntimeMacros.hpp"
 #include "alpaka/core/Sycl.hpp"
 #include "alpaka/core/ThreadPool.hpp"
 #include "alpaka/core/Unreachable.hpp"
@@ -96,6 +98,11 @@
 #include "alpaka/event/EventGpuSyclIntel.hpp"
 #include "alpaka/event/EventHipRt.hpp"
 #include "alpaka/event/Traits.hpp"
+// exec
+#include "alpaka/exec/ElementIndex.hpp"
+#include "alpaka/exec/IndependentElements.hpp"
+#include "alpaka/exec/Once.hpp"
+#include "alpaka/exec/UniformElements.hpp"
 // extent
 #include "alpaka/extent/Traits.hpp"
 // idx
@@ -147,6 +154,10 @@
 #include "alpaka/mem/fence/MemFenceOmp2Threads.hpp"
 #include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
 #include "alpaka/mem/fence/Traits.hpp"
+#include "alpaka/mem/global/DeviceGlobalCpu.hpp"
+#include "alpaka/mem/global/DeviceGlobalGenericSycl.hpp"
+#include "alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp"
+#include "alpaka/mem/global/Traits.hpp"
 #include "alpaka/mem/view/Traits.hpp"
 #include "alpaka/mem/view/ViewConst.hpp"
 #include "alpaka/mem/view/ViewPlainPtr.hpp"
@@ -157,7 +168,6 @@
 #include "alpaka/meta/Apply.hpp"
 #include "alpaka/meta/CartesianProduct.hpp"
 #include "alpaka/meta/Concatenate.hpp"
-#include "alpaka/meta/CudaVectorArrayWrapper.hpp"
 #include "alpaka/meta/DependentFalseType.hpp"
 #include "alpaka/meta/Filter.hpp"
 #include "alpaka/meta/Fold.hpp"
diff --git a/alpaka/include/alpaka/atomic/AtomicAtomicRef.hpp b/alpaka/include/alpaka/atomic/AtomicAtomicRef.hpp
index f38b3dee..61b825c1 100644
--- a/alpaka/include/alpaka/atomic/AtomicAtomicRef.hpp
+++ b/alpaka/include/alpaka/atomic/AtomicAtomicRef.hpp
@@ -42,7 +42,7 @@ namespace alpaka
     void isSupportedByAtomicAtomicRef()
     {
         static_assert(
-            std::is_trivially_copyable_v<T> && detail::atomic_ref<T>::required_alignment <= alignof(T),
+            std::is_trivially_copyable_v<T> && alpaka::detail::atomic_ref<T>::required_alignment <= alignof(T),
             "Type not supported by AtomicAtomicRef, please recompile defining "
             "ALPAKA_DISABLE_ATOMIC_ATOMICREF.");
     }
@@ -56,7 +56,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 return ref.fetch_add(value);
             }
         };
@@ -68,7 +68,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 return ref.fetch_sub(value);
             }
         };
@@ -80,7 +80,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result = old;
                 result = std::min(result, value);
@@ -100,7 +100,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result = old;
                 result = std::max(result, value);
@@ -120,7 +120,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result = value;
                 while(!ref.compare_exchange_weak(old, result))
@@ -138,7 +138,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result = ((old >= value) ? 0 : static_cast<T>(old + 1));
                 while(!ref.compare_exchange_weak(old, result))
@@ -156,7 +156,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result = ((old >= value) ? 0 : static_cast<T>(old - 1));
                 while(!ref.compare_exchange_weak(old, result))
@@ -174,7 +174,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 return ref.fetch_and(value);
             }
         };
@@ -186,7 +186,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 return ref.fetch_or(value);
             }
         };
@@ -198,7 +198,7 @@ namespace alpaka
             ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 return ref.fetch_xor(value);
             }
         };
@@ -214,12 +214,19 @@ namespace alpaka
                 T const& value) -> T
             {
                 isSupportedByAtomicAtomicRef<T>();
-                detail::atomic_ref<T> ref(*addr);
+                alpaka::detail::atomic_ref<T> ref(*addr);
                 T old = ref;
                 T result;
                 do
                 {
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wfloat-equal"
+#    endif
                     result = ((old == compare) ? value : old);
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#        pragma GCC diagnostic pop
+#    endif
                 } while(!ref.compare_exchange_weak(old, result));
                 return old;
             }
diff --git a/alpaka/include/alpaka/atomic/AtomicCpu.hpp b/alpaka/include/alpaka/atomic/AtomicCpu.hpp
index 4dae7f31..5667bd00 100644
--- a/alpaka/include/alpaka/atomic/AtomicCpu.hpp
+++ b/alpaka/include/alpaka/atomic/AtomicCpu.hpp
@@ -1,14 +1,20 @@
-/* Copyright 2021 Andrea Bocci, Felice Pantaleo
+/* Copyright 2024 Andrea Bocci, Felice Pantaleo
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include <boost/version.hpp>
+#include "alpaka/core/BoostPredef.hpp"
 
-#ifndef ALPAKA_DISABLE_ATOMIC_ATOMICREF
-#    define ALPAKA_DISABLE_ATOMIC_ATOMICREF
-#endif
+// clang 9/10/11 together with nvcc<11.6.0 as host compiler fails at compile time when using boost::atomic_ref
+#ifdef BOOST_COMP_CLANG_AVAILABLE
+#    if(BOOST_COMP_CLANG < BOOST_VERSION_NUMBER(12, 0, 0) && BOOST_COMP_NVCC                                          \
+        && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 6, 0))
+#        if !defined(ALPAKA_DISABLE_ATOMIC_ATOMICREF)
+#            define ALPAKA_DISABLE_ATOMIC_ATOMICREF
+#        endif
+#    endif
+#endif // BOOST_COMP_CLANG_AVAILABLE
 
 #include "alpaka/atomic/AtomicAtomicRef.hpp"
 #include "alpaka/atomic/AtomicStdLibLock.hpp"
diff --git a/alpaka/include/alpaka/atomic/AtomicGenericSycl.hpp b/alpaka/include/alpaka/atomic/AtomicGenericSycl.hpp
index 8ebf608d..bdfa53ba 100644
--- a/alpaka/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ b/alpaka/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -51,47 +51,14 @@ namespace alpaka
             static constexpr auto value = sycl::memory_scope::work_group;
         };
 
-        template<typename T>
-        inline auto get_global_ptr(T* const addr)
-        {
-            return sycl::address_space_cast<sycl::access::address_space::global_space, sycl::access::decorated::no>(
-                addr);
-        }
-
-        template<typename T>
-        inline auto get_local_ptr(T* const addr)
-        {
-            return sycl::address_space_cast<sycl::access::address_space::local_space, sycl::access::decorated::no>(
-                addr);
-        }
-
-        template<typename T, typename THierarchy>
-        using global_ref = sycl::atomic_ref<
-            T,
-            sycl::memory_order::relaxed,
-            SyclMemoryScope<THierarchy>::value,
-            sycl::access::address_space::global_space>;
-
         template<typename T, typename THierarchy>
-        using local_ref = sycl::atomic_ref<
-            T,
-            sycl::memory_order::relaxed,
-            SyclMemoryScope<THierarchy>::value,
-            sycl::access::address_space::local_space>;
+        using sycl_atomic_ref = sycl::atomic_ref<T, sycl::memory_order::relaxed, SyclMemoryScope<THierarchy>::value>;
 
         template<typename THierarchy, typename T, typename TOp>
         inline auto callAtomicOp(T* const addr, TOp&& op)
         {
-            if(auto ptr = get_global_ptr(addr); ptr != nullptr)
-            {
-                auto ref = global_ref<T, THierarchy>{*addr};
-                return op(ref);
-            }
-            else
-            {
-                auto ref = local_ref<T, THierarchy>{*addr};
-                return op(ref);
-            }
+            auto ref = sycl_atomic_ref<T, THierarchy>{*addr};
+            return op(ref);
         }
 
         template<typename TRef, typename T, typename TEval>
@@ -178,7 +145,7 @@ namespace alpaka::trait
     struct AtomicOp<AtomicExch, AtomicGenericSycl, T, THierarchy>
     {
         static_assert(
-            (std::is_integral_v<T> || std::is_floating_point_v<T>) &&(sizeof(T) == 4 || sizeof(T) == 8),
+            (std::is_integral_v<T> || std::is_floating_point_v<T>) and(sizeof(T) == 4 || sizeof(T) == 8),
             "SYCL atomics do not support this type");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
@@ -200,10 +167,7 @@ namespace alpaka::trait
         {
             auto inc = [&value](auto old_val)
             { return (old_val >= value) ? static_cast<T>(0) : (old_val + static_cast<T>(1)); };
-            if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
-                return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, inc);
-            else
-                return alpaka::detail::casWithCondition<alpaka::detail::local_ref<T, THierarchy>>(addr, inc);
+            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, inc);
         }
     };
 
@@ -220,10 +184,7 @@ namespace alpaka::trait
         {
             auto dec = [&value](auto& old_val)
             { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast<T>(1)); };
-            if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
-                return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, dec);
-            else
-                return alpaka::detail::casWithCondition<alpaka::detail::local_ref<T, THierarchy>>(addr, dec);
+            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, dec);
         }
     };
 
@@ -294,16 +255,7 @@ namespace alpaka::trait
                 return expected_;
             };
 
-            if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
-            {
-                auto ref = alpaka::detail::global_ref<T, THierarchy>{*addr};
-                return cas(ref);
-            }
-            else
-            {
-                auto ref = alpaka::detail::local_ref<T, THierarchy>{*addr};
-                return cas(ref);
-            }
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, cas);
         }
     };
 } // namespace alpaka::trait
diff --git a/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
index 4e2af194..0c09cf18 100644
--- a/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
+++ b/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
@@ -35,7 +35,7 @@ namespace alpaka::trait
     {
         static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T*
         {
-            return reinterpret_cast<T*>(shared.m_accessor.get_pointer().get());
+            return reinterpret_cast<T*>(shared.m_accessor.get_multi_ptr<sycl::access::decorated::no>().get());
         }
     };
 } // namespace alpaka::trait
diff --git a/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp b/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
index 279dab16..8364019a 100644
--- a/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
+++ b/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
@@ -8,6 +8,7 @@
 #include "alpaka/core/BoostPredef.hpp"
 #include "alpaka/core/Concepts.hpp"
 
+#include <cstddef>
 #include <type_traits>
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
@@ -38,10 +39,12 @@ namespace alpaka
             __device__ static auto getMem(BlockSharedMemDynUniformCudaHipBuiltIn const&) -> T*
             {
                 // Because unaligned access to variables is not allowed in device code,
-                // we have to use the widest possible type to have all types aligned correctly.
-                // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
-                // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
-                extern __shared__ float4 shMem[];
+                // we use the widest possible alignment supported by CUDA types to have
+                // all types aligned correctly.
+                // See:
+                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
+                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
+                extern __shared__ std::byte shMem alignas(std::max_align_t)[];
                 return reinterpret_cast<T*>(shMem);
             }
         };
diff --git a/alpaka/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/alpaka/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
index f92df9c0..060414d6 100644
--- a/alpaka/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
+++ b/alpaka/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
@@ -24,7 +24,7 @@ namespace alpaka
     public:
         BlockSharedMemStGenericSycl(sycl::local_accessor<std::byte> accessor)
             : BlockSharedMemStMemberImpl(
-                reinterpret_cast<std::uint8_t*>(accessor.get_pointer().get()),
+                reinterpret_cast<std::uint8_t*>(accessor.get_multi_ptr<sycl::access::decorated::no>().get()),
                 accessor.size())
             , m_accessor{accessor}
         {
diff --git a/alpaka/include/alpaka/core/ApiCudaRt.hpp b/alpaka/include/alpaka/core/ApiCudaRt.hpp
index 3dda1e48..ee2cdb2e 100644
--- a/alpaka/include/alpaka/core/ApiCudaRt.hpp
+++ b/alpaka/include/alpaka/core/ApiCudaRt.hpp
@@ -75,6 +75,7 @@ namespace alpaka
         static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
         static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
         static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
+        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
 
         static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
         static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
@@ -181,7 +182,14 @@ namespace alpaka
         template<typename T>
         static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
         {
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wconditionally-supported"
+#    endif
             return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
         }
 
         static inline Error_t getDeviceCount(int* count)
diff --git a/alpaka/include/alpaka/core/ApiHipRt.hpp b/alpaka/include/alpaka/core/ApiHipRt.hpp
index 69590a45..d765246c 100644
--- a/alpaka/include/alpaka/core/ApiHipRt.hpp
+++ b/alpaka/include/alpaka/core/ApiHipRt.hpp
@@ -78,6 +78,7 @@ namespace alpaka
             = ::hipDeviceAttributeMaxSharedMemoryPerBlock;
         static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock;
         static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount;
+        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize;
 
 #    if HIP_VERSION >= 40'500'000
         static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize;
@@ -206,7 +207,14 @@ namespace alpaka
         template<typename T>
         static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
         {
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wconditionally-supported"
+#    endif
             return ::hipFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
         }
 
         static inline Error_t getDeviceCount(int* count)
diff --git a/alpaka/include/alpaka/core/Common.hpp b/alpaka/include/alpaka/core/Common.hpp
index 90d8d40c..3b181ee2 100644
--- a/alpaka/include/alpaka/core/Common.hpp
+++ b/alpaka/include/alpaka/core/Common.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci
+/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -98,12 +98,12 @@
 //! This macro defines a variable lying in global accelerator device memory.
 //!
 //! Example:
-//!   ALPAKA_STATIC_ACC_MEM_GLOBAL int i;
+//!   ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> variable;
 //!
-//! Those variables behave like ordinary variables when used in file-scope.
-//! They have external linkage (are accessible from other compilation units).
-//! If you want to access it from a different compilation unit, you have to declare it as extern:
-//!   extern ALPAKA_STATIC_ACC_MEM_GLOBAL int i;
+//! Those variables behave like ordinary variables when used in file-scope,
+//! but inside kernels the get() method must be used to access the variable.
+//! They are declared inline to resolve to a single instance across multiple
+//! translation units.
 //! Like ordinary variables, only one definition is allowed (ODR)
 //! Failure to do so might lead to linker errors.
 //!
@@ -112,36 +112,53 @@
 //! because this is forbidden by CUDA.
 //!
 //! \attention It is not allowed to initialize the variable together with the declaration.
-//!            To initialize the variable alpaka::createStaticDevMemView and alpaka::memcpy must be used.
+//!            To initialize the variable alpaka::memcpy must be used.
 //! \code{.cpp}
-//! ALPAKA_STATIC_ACC_MEM_GLOBAL int foo;
+//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> foo;
+//!
+//! struct DeviceMemoryKernel
+//! {
+//!    ALPAKA_NO_HOST_ACC_WARNING
+//!    template<typename TAcc>
+//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
+//!    {
+//!      auto a = foo<TAcc>.get();
+//!    }
+//!  }
 //!
 //! void initFoo() {
 //!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
-//!     auto viewFoo = alpaka::createStaticDevMemView(&foo, device, extent);
 //!     int initialValue = 42;
 //!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
-//!     alpaka::memcpy(queue, viewGlobalMemUninitialized, bufHost, extent);
+//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
 //! }
 //! \endcode
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
-#    define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
-#elif defined(ALPAKA_ACC_SYCL_ENABLED)
-#    define ALPAKA_STATIC_ACC_MEM_GLOBAL _Pragma("GCC error \"The SYCL backend does not support global device variables.\""))
+#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
+#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
+            template<typename TAcc>                                                                                   \
+            __device__ inline
+#    else
+#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
+            template<typename TAcc>                                                                                   \
+            __device__ static
+#    endif
 #else
-#    define ALPAKA_STATIC_ACC_MEM_GLOBAL
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                              \
+        template<typename TAcc>                                                                                       \
+        inline
 #endif
 
 //! This macro defines a variable lying in constant accelerator device memory.
 //!
 //! Example:
-//!   ALPAKA_STATIC_ACC_MEM_CONSTANT int i;
+//!   ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> variable;
 //!
-//! Those variables behave like ordinary variables when used in file-scope.
-//! They have external linkage (are accessible from other compilation units).
-//! If you want to access it from a different compilation unit, you have to declare it as extern:
-//!   extern ALPAKA_STATIC_ACC_MEM_CONSTANT int i;
+//! Those variables behave like ordinary variables when used in file-scope,
+//! but inside kernels the get() method must be used to access the variable.
+//! They are declared inline to resolve to a single instance across multiple
+//! translation units.
 //! Like ordinary variables, only one definition is allowed (ODR)
 //! Failure to do so might lead to linker errors.
 //!
@@ -150,25 +167,42 @@
 //! because this is forbidden by CUDA.
 //!
 //! \attention It is not allowed to initialize the variable together with the declaration.
-//!            To initialize the variable alpaka::createStaticDevMemView and alpaka::memcpy must be used.
+//!            To initialize the variable alpaka::memcpy must be used.
 //! \code{.cpp}
-//! ALPAKA_STATIC_ACC_MEM_CONSTANT int foo;
+//! ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> foo;
+//!
+//! struct DeviceMemoryKernel
+//! {
+//!    ALPAKA_NO_HOST_ACC_WARNING
+//!    template<typename TAcc>
+//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
+//!    {
+//!      auto a = foo<TAcc>.get();
+//!    }
+//!  }
 //!
 //! void initFoo() {
 //!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
-//!     auto viewFoo = alpaka::createStaticDevMemView(&foo, device, extent);
 //!     int initialValue = 42;
 //!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
-//!     alpaka::memcpy(queue, viewGlobalMemUninitialized, bufHost, extent);
+//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
 //! }
 //! \endcode
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
-#    define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
-#elif defined(ALPAKA_ACC_SYCL_ENABLED)
-#    define ALPAKA_STATIC_ACC_MEM_CONSTANT _Pragma("GCC error \"The SYCL backend does not support global device constants.\""))
+#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
+#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
+            template<typename TAcc>                                                                                   \
+            __constant__ inline
+#    else
+#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
+            template<typename TAcc>                                                                                   \
+            __constant__ static
+#    endif
 #else
-#    define ALPAKA_STATIC_ACC_MEM_CONSTANT
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                            \
+        template<typename TAcc>                                                                                       \
+        inline
 #endif
 
 //! This macro disables memory optimizations for annotated device memory.
diff --git a/alpaka/include/alpaka/core/RuntimeMacros.hpp b/alpaka/include/alpaka/core/RuntimeMacros.hpp
new file mode 100644
index 00000000..80faa331
--- /dev/null
+++ b/alpaka/include/alpaka/core/RuntimeMacros.hpp
@@ -0,0 +1,52 @@
+/* Copyright 2022  Andrea Bocci, Mehmet Yusufoglu, René Widera, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Implementation details
+#include "alpaka/core/Sycl.hpp"
+
+//! ALPAKA_THROW_ACC either aborts(terminating the program and creating a core dump) or throws std::runtime_error
+//! depending on the Acc. The std::runtime_error exception can be catched in the catch block.
+//!
+//! For CUDA __trap function is used which triggers std::runtime_error but can be catched during wait not exec.
+//! For HIP abort() function is used and calls __builtin_trap()
+//! For Sycl assert(false) is not used since it can be disabled -DNDEBUG compile option. abort() is used although it
+//! generates a runtime error instead of aborting in GPUs: "Caught synchronous SYCL exception: Unresolved Symbol
+//! <abort> -999 (Unknown PI error)."
+//!
+//! The OpenMP specification mandates that exceptions thrown by some thread must be handled by the same thread.
+//! Therefore std::runtime_error thrown by ALPAKA_THROW_ACC aborts the the program for OpenMP backends. If needed
+//! the SIGABRT signal can be catched by signal handler.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        {                                                                                                             \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the CUDA back-end:\n%s",          \
+                (MSG));                                                                                               \
+            __trap();                                                                                                 \
+        }
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        {                                                                                                             \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the HIP back-end:\n%s",           \
+                (MSG));                                                                                               \
+            abort();                                                                                                  \
+        }
+#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        {                                                                                                             \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the SYCL back-end:\n%s",          \
+                (MSG));                                                                                               \
+            abort();                                                                                                  \
+        }
+#else
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        {                                                                                                             \
+            printf("alpaka encountered a user-defined error condition:\n%s", (MSG));                                  \
+            throw std::runtime_error(MSG);                                                                            \
+        }
+#endif
diff --git a/alpaka/include/alpaka/core/ThreadPool.hpp b/alpaka/include/alpaka/core/ThreadPool.hpp
index e99d4162..b59555a3 100644
--- a/alpaka/include/alpaka/core/ThreadPool.hpp
+++ b/alpaka/include/alpaka/core/ThreadPool.hpp
@@ -55,7 +55,18 @@ namespace alpaka::core::detail
         template<typename TFnObj, typename... TArgs>
         auto enqueueTask(TFnObj&& task, TArgs&&... args) -> std::future<void>
         {
-            auto ptask = Task{[=, t = std::forward<TFnObj>(task)]() noexcept(noexcept(task(args...))) { t(args...); }};
+#if BOOST_COMP_MSVC
+// MSVC 14.39.33519 is throwing an error because the noexcept type deduction is not defined in original C++17
+// error C2065: 'task': undeclared identifier
+// see: https://stackoverflow.com/a/72467726
+#    define ALPAKA_NOEXCEPT(...)
+#else
+#    define ALPAKA_NOEXCEPT(...) noexcept(__VA_ARGS__)
+#endif
+            auto ptask
+                = Task{[=, t = std::forward<TFnObj>(task)]() ALPAKA_NOEXCEPT(noexcept(task(args...))) { t(args...); }};
+#undef ALPAKA_NOEXCEPT
+
             auto future = ptask.get_future();
             {
                 std::lock_guard<std::mutex> lock{m_mutex};
diff --git a/alpaka/include/alpaka/core/UniformCudaHip.hpp b/alpaka/include/alpaka/core/UniformCudaHip.hpp
index 7aa201f9..0896f9de 100644
--- a/alpaka/include/alpaka/core/UniformCudaHip.hpp
+++ b/alpaka/include/alpaka/core/UniformCudaHip.hpp
@@ -61,7 +61,8 @@ namespace alpaka::uniform_cuda_hip::detail
             if(std::find(std::cbegin(ignoredErrorCodes), std::cend(ignoredErrorCodes), error)
                == std::cend(ignoredErrorCodes))
             {
-                rtCheck<TApi, TThrow>(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
+                using namespace std::literals;
+                rtCheck<TApi, TThrow>(error, ("'"s + std::string(cmd) + "' returned error "s).c_str(), file, line);
             }
             else
             {
diff --git a/alpaka/include/alpaka/core/Utility.hpp b/alpaka/include/alpaka/core/Utility.hpp
index 03ea7264..2610027e 100644
--- a/alpaka/include/alpaka/core/Utility.hpp
+++ b/alpaka/include/alpaka/core/Utility.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2024 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 #pragma once
@@ -23,14 +23,14 @@ namespace alpaka::core
 #endif
 
     /// Returns the ceiling of a / b, as integer.
-    template<typename Integral>
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
     [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
     {
         return (a + b - 1) / b;
     }
 
     /// Computes the nth power of base, in integers.
-    template<typename Integral>
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
     [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto intPow(Integral base, Integral n) -> Integral
     {
         if(n == 0)
@@ -42,7 +42,7 @@ namespace alpaka::core
     }
 
     /// Computes the floor of the nth root of value, in integers.
-    template<typename Integral>
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
     [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
     {
         // adapted from: https://en.wikipedia.org/wiki/Integer_square_root
@@ -58,4 +58,5 @@ namespace alpaka::core
         }
         return L;
     }
+
 } // namespace alpaka::core
diff --git a/alpaka/include/alpaka/dev/DevCpuSycl.hpp b/alpaka/include/alpaka/dev/DevCpuSycl.hpp
index 04b15a86..bc88ce9d 100644
--- a/alpaka/include/alpaka/dev/DevCpuSycl.hpp
+++ b/alpaka/include/alpaka/dev/DevCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/platform/PlatformCpuSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using DevCpuSycl = DevGenericSycl<PlatformCpuSycl>;
+    using DevCpuSycl = DevGenericSycl<TagCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/dev/DevFpgaSyclIntel.hpp b/alpaka/include/alpaka/dev/DevFpgaSyclIntel.hpp
index 516027db..c0c66ef9 100644
--- a/alpaka/include/alpaka/dev/DevFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/dev/DevFpgaSyclIntel.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
-    using DevFpgaSyclIntel = DevGenericSycl<PlatformFpgaSyclIntel>;
+    using DevFpgaSyclIntel = DevGenericSycl<TagFpgaSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/dev/DevGenericSycl.hpp b/alpaka/include/alpaka/dev/DevGenericSycl.hpp
index 729090f8..efbcad92 100644
--- a/alpaka/include/alpaka/dev/DevGenericSycl.hpp
+++ b/alpaka/include/alpaka/dev/DevGenericSycl.hpp
@@ -32,7 +32,22 @@
 
 namespace alpaka
 {
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
+    namespace trait
+    {
+        template<typename TPlatform, typename TSfinae>
+        struct GetDevByIdx;
+    } // namespace trait
+
+    template<typename TTag>
+    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>;
+
+    template<typename TTag>
+    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>;
+
+    template<typename TTag>
+    struct PlatformGenericSycl;
+
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
     class BufGenericSycl;
 
     namespace detail
@@ -105,11 +120,13 @@ namespace alpaka
     } // namespace detail
 
     //! The SYCL device handle.
-    template<typename TPlatform>
+    template<typename TTag>
     class DevGenericSycl
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevGenericSycl<TPlatform>>
-        , public concepts::Implements<ConceptDev, DevGenericSycl<TPlatform>>
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevGenericSycl<TTag>>
+        , public concepts::Implements<ConceptDev, DevGenericSycl<TTag>>
     {
+        friend struct trait::GetDevByIdx<PlatformGenericSycl<TTag>>;
+
     public:
         DevGenericSycl(sycl::device device, sycl::context context)
             : m_impl{std::make_shared<detail::DevGenericSyclImpl>(std::move(device), std::move(context))}
@@ -133,128 +150,133 @@ namespace alpaka
 
         std::shared_ptr<detail::DevGenericSyclImpl> m_impl;
     };
-} // namespace alpaka
 
-namespace alpaka::trait
-{
-    //! The SYCL device name get trait specialization.
-    template<typename TPlatform>
-    struct GetName<DevGenericSycl<TPlatform>>
+    namespace trait
     {
-        static auto getName(DevGenericSycl<TPlatform> const& dev) -> std::string
+        //! The SYCL device name get trait specialization.
+        template<typename TTag>
+        struct GetName<DevGenericSycl<TTag>>
         {
-            auto const device = dev.getNativeHandle().first;
-            return device.template get_info<sycl::info::device::name>();
-        }
-    };
+            static auto getName(DevGenericSycl<TTag> const& dev) -> std::string
+            {
+                auto const device = dev.getNativeHandle().first;
+                return device.template get_info<sycl::info::device::name>();
+            }
+        };
 
-    //! The SYCL device available memory get trait specialization.
-    template<typename TPlatform>
-    struct GetMemBytes<DevGenericSycl<TPlatform>>
-    {
-        static auto getMemBytes(DevGenericSycl<TPlatform> const& dev) -> std::size_t
+        //! The SYCL device available memory get trait specialization.
+        template<typename TTag>
+        struct GetMemBytes<DevGenericSycl<TTag>>
         {
-            auto const device = dev.getNativeHandle().first;
-            return device.template get_info<sycl::info::device::global_mem_size>();
-        }
-    };
+            static auto getMemBytes(DevGenericSycl<TTag> const& dev) -> std::size_t
+            {
+                auto const device = dev.getNativeHandle().first;
+                return device.template get_info<sycl::info::device::global_mem_size>();
+            }
+        };
 
-    //! The SYCL device free memory get trait specialization.
-    template<typename TPlatform>
-    struct GetFreeMemBytes<DevGenericSycl<TPlatform>>
-    {
-        static auto getFreeMemBytes(DevGenericSycl<TPlatform> const& /* dev */) -> std::size_t
+        //! The SYCL device free memory get trait specialization.
+        template<typename TTag>
+        struct GetFreeMemBytes<DevGenericSycl<TTag>>
         {
-            static_assert(!sizeof(TPlatform), "Querying free device memory not supported for SYCL devices.");
-            return std::size_t{};
-        }
-    };
+            static auto getFreeMemBytes(DevGenericSycl<TTag> const& /* dev */) -> std::size_t
+            {
+                static_assert(
+                    !sizeof(PlatformGenericSycl<TTag>),
+                    "Querying free device memory not supported for SYCL devices.");
+                return std::size_t{};
+            }
+        };
 
-    //! The SYCL device warp size get trait specialization.
-    template<typename TPlatform>
-    struct GetWarpSizes<DevGenericSycl<TPlatform>>
-    {
-        static auto getWarpSizes(DevGenericSycl<TPlatform> const& dev) -> std::vector<std::size_t>
+        //! The SYCL device warp size get trait specialization.
+        template<typename TTag>
+        struct GetWarpSizes<DevGenericSycl<TTag>>
         {
-            auto const device = dev.getNativeHandle().first;
-            std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
-            // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
-            auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
-            if(find64 != warp_sizes.end())
-                warp_sizes.erase(find64);
-            // Sort the warp sizes in decreasing order
-            std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
-            return warp_sizes;
-        }
-    };
+            static auto getWarpSizes(DevGenericSycl<TTag> const& dev) -> std::vector<std::size_t>
+            {
+                auto const device = dev.getNativeHandle().first;
+                std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
+                // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
+                auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
+                if(find64 != warp_sizes.end())
+                    warp_sizes.erase(find64);
+                // Sort the warp sizes in decreasing order
+                std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
+                return warp_sizes;
+            }
+        };
 
-    //! The SYCL device preferred warp size get trait specialization.
-    template<typename TPlatform>
-    struct GetPreferredWarpSize<DevGenericSycl<TPlatform>>
-    {
-        static auto getPreferredWarpSize(DevGenericSycl<TPlatform> const& dev) -> std::size_t
+        //! The SYCL device preferred warp size get trait specialization.
+        template<typename TTag>
+        struct GetPreferredWarpSize<DevGenericSycl<TTag>>
         {
-            return GetWarpSizes<DevGenericSycl<TPlatform>>::getWarpSizes(dev).front();
-        }
-    };
+            static auto getPreferredWarpSize(DevGenericSycl<TTag> const& dev) -> std::size_t
+            {
+                return GetWarpSizes<DevGenericSycl<TTag>>::getWarpSizes(dev).front();
+            }
+        };
 
-    //! The SYCL device reset trait specialization.
-    template<typename TPlatform>
-    struct Reset<DevGenericSycl<TPlatform>>
-    {
-        static auto reset(DevGenericSycl<TPlatform> const&) -> void
+        //! The SYCL device reset trait specialization.
+        template<typename TTag>
+        struct Reset<DevGenericSycl<TTag>>
         {
-            static_assert(!sizeof(TPlatform), "Explicit device reset not supported for SYCL devices");
-        }
-    };
+            static auto reset(DevGenericSycl<TTag> const&) -> void
+            {
+                static_assert(
+                    !sizeof(PlatformGenericSycl<TTag>),
+                    "Explicit device reset not supported for SYCL devices");
+            }
+        };
 
-    //! The SYCL device native handle trait specialization.
-    template<typename TPlatform>
-    struct NativeHandle<DevGenericSycl<TPlatform>>
-    {
-        [[nodiscard]] static auto getNativeHandle(DevGenericSycl<TPlatform> const& dev)
+        //! The SYCL device native handle trait specialization.
+        template<typename TTag>
+        struct NativeHandle<DevGenericSycl<TTag>>
         {
-            return dev.getNativeHandle();
-        }
-    };
+            [[nodiscard]] static auto getNativeHandle(DevGenericSycl<TTag> const& dev)
+            {
+                return dev.getNativeHandle();
+            }
+        };
 
-    //! The SYCL device memory buffer type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct BufType<DevGenericSycl<TPlatform>, TElem, TDim, TIdx>
-    {
-        using type = BufGenericSycl<TElem, TDim, TIdx, TPlatform>;
-    };
+        //! The SYCL device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx, typename TTag>
+        struct BufType<DevGenericSycl<TTag>, TElem, TDim, TIdx>
+        {
+            using type = BufGenericSycl<TElem, TDim, TIdx, TTag>;
+        };
 
-    //! The SYCL device platform type trait specialization.
-    template<typename TPlatform>
-    struct PlatformType<DevGenericSycl<TPlatform>>
-    {
-        using type = TPlatform;
-    };
+        //! The SYCL device platform type trait specialization.
+        template<typename TTag>
+        struct PlatformType<DevGenericSycl<TTag>>
+        {
+            using type = PlatformGenericSycl<TTag>;
+        };
 
-    //! The thread SYCL device wait specialization.
-    template<typename TPlatform>
-    struct CurrentThreadWaitFor<DevGenericSycl<TPlatform>>
-    {
-        static auto currentThreadWaitFor(DevGenericSycl<TPlatform> const& dev) -> void
+        //! The thread SYCL device wait specialization.
+        template<typename TTag>
+        struct CurrentThreadWaitFor<DevGenericSycl<TTag>>
         {
-            dev.m_impl->wait();
-        }
-    };
+            static auto currentThreadWaitFor(DevGenericSycl<TTag> const& dev) -> void
+            {
+                dev.m_impl->wait();
+            }
+        };
 
-    //! The SYCL blocking queue trait specialization.
-    template<typename TPlatform>
-    struct QueueType<DevGenericSycl<TPlatform>, Blocking>
-    {
-        using type = detail::QueueGenericSyclBase<DevGenericSycl<TPlatform>, true>;
-    };
+        //! The SYCL blocking queue trait specialization.
+        template<typename TTag>
+        struct QueueType<DevGenericSycl<TTag>, Blocking>
+        {
+            using type = QueueGenericSyclBlocking<TTag>;
+        };
 
-    //! The SYCL non-blocking queue trait specialization.
-    template<typename TPlatform>
-    struct QueueType<DevGenericSycl<TPlatform>, NonBlocking>
-    {
-        using type = detail::QueueGenericSyclBase<DevGenericSycl<TPlatform>, false>;
-    };
-} // namespace alpaka::trait
+        //! The SYCL non-blocking queue trait specialization.
+        template<typename TTag>
+        struct QueueType<DevGenericSycl<TTag>, NonBlocking>
+        {
+            using type = QueueGenericSyclNonBlocking<TTag>;
+        };
+
+    } // namespace trait
+} // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/dev/DevGpuSyclIntel.hpp b/alpaka/include/alpaka/dev/DevGpuSyclIntel.hpp
index 9897d40e..28501267 100644
--- a/alpaka/include/alpaka/dev/DevGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/dev/DevGpuSyclIntel.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 namespace alpaka
 {
-    using DevGpuSyclIntel = DevGenericSycl<PlatformGpuSyclIntel>;
+    using DevGpuSyclIntel = DevGenericSycl<TagGpuSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp b/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp
index 1b0758d6..876d8ca5 100644
--- a/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp
+++ b/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp
@@ -175,10 +175,11 @@ namespace alpaka
         {
             ALPAKA_FN_HOST static auto getPreferredWarpSize(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
             {
-                typename TApi::DeviceProp_t devProp;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
+                int warpSize = 0;
 
-                return static_cast<std::size_t>(devProp.warpSize);
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, dev.getNativeHandle()));
+                return static_cast<std::size_t>(warpSize);
             }
         };
 
diff --git a/alpaka/include/alpaka/dev/Traits.hpp b/alpaka/include/alpaka/dev/Traits.hpp
index 096ce5bb..a3954f29 100644
--- a/alpaka/include/alpaka/dev/Traits.hpp
+++ b/alpaka/include/alpaka/dev/Traits.hpp
@@ -61,7 +61,7 @@ namespace alpaka
 
     //! True if TDev is a device, i.e. if it implements the ConceptDev concept.
     template<typename TDev>
-    inline constexpr bool isDevice = concepts::ImplementsConcept<ConceptDev, TDev>::value;
+    inline constexpr bool isDevice = concepts::ImplementsConcept<ConceptDev, std::decay_t<TDev>>::value;
 
     //! \return The device this object is bound to.
     template<typename T>
diff --git a/alpaka/include/alpaka/event/EventCpuSycl.hpp b/alpaka/include/alpaka/event/EventCpuSycl.hpp
index c95ed8e2..91a9517b 100644
--- a/alpaka/include/alpaka/event/EventCpuSycl.hpp
+++ b/alpaka/include/alpaka/event/EventCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using EventCpuSycl = EventGenericSycl<DevCpuSycl>;
+    using EventCpuSycl = EventGenericSycl<TagCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/event/EventFpgaSyclIntel.hpp b/alpaka/include/alpaka/event/EventFpgaSyclIntel.hpp
index d79d8aca..3646fe74 100644
--- a/alpaka/include/alpaka/event/EventFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/event/EventFpgaSyclIntel.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
-    using EventFpgaSyclIntel = EventGenericSycl<DevFpgaSyclIntel>;
+    using EventFpgaSyclIntel = EventGenericSycl<TagFpgaSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/event/EventGenericSycl.hpp b/alpaka/include/alpaka/event/EventGenericSycl.hpp
index 68011a02..7ea85384 100644
--- a/alpaka/include/alpaka/event/EventGenericSycl.hpp
+++ b/alpaka/include/alpaka/event/EventGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Aurora Perego
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -22,11 +22,11 @@
 namespace alpaka
 {
     //! The SYCL device event.
-    template<typename TDev>
+    template<typename TTag>
     class EventGenericSycl final
     {
     public:
-        explicit EventGenericSycl(TDev const& dev) : m_dev{dev}
+        explicit EventGenericSycl(DevGenericSycl<TTag> const& dev) : m_dev{dev}
         {
         }
 
@@ -50,7 +50,7 @@ namespace alpaka
             m_event = event;
         }
 
-        TDev m_dev;
+        DevGenericSycl<TTag> m_dev;
 
     private:
         sycl::event m_event{};
@@ -60,20 +60,20 @@ namespace alpaka
 namespace alpaka::trait
 {
     //! The SYCL device event device get trait specialization.
-    template<typename TDev>
-    struct GetDev<EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct GetDev<EventGenericSycl<TTag>>
     {
-        static auto getDev(EventGenericSycl<TDev> const& event) -> TDev
+        static auto getDev(EventGenericSycl<TTag> const& event) -> DevGenericSycl<TTag>
         {
             return event.m_dev;
         }
     };
 
     //! The SYCL device event test trait specialization.
-    template<typename TDev>
-    struct IsComplete<EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct IsComplete<EventGenericSycl<TTag>>
     {
-        static auto isComplete(EventGenericSycl<TDev> const& event)
+        static auto isComplete(EventGenericSycl<TTag> const& event)
         {
             auto const status
                 = event.getNativeHandle().template get_info<sycl::info::event::command_execution_status>();
@@ -82,20 +82,20 @@ namespace alpaka::trait
     };
 
     //! The SYCL queue enqueue trait specialization.
-    template<typename TDev>
-    struct Enqueue<QueueGenericSyclNonBlocking<TDev>, EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct Enqueue<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
     {
-        static auto enqueue(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
+        static auto enqueue(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
         {
             event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
     };
 
     //! The SYCL queue enqueue trait specialization.
-    template<typename TDev>
-    struct Enqueue<QueueGenericSyclBlocking<TDev>, EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct Enqueue<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
     {
-        static auto enqueue(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
+        static auto enqueue(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
         {
             event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
@@ -105,30 +105,30 @@ namespace alpaka::trait
     //!
     //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
     //! completed. If the event is not enqueued to a queue the method returns immediately.
-    template<typename TDev>
-    struct CurrentThreadWaitFor<EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct CurrentThreadWaitFor<EventGenericSycl<TTag>>
     {
-        static auto currentThreadWaitFor(EventGenericSycl<TDev> const& event)
+        static auto currentThreadWaitFor(EventGenericSycl<TTag> const& event)
         {
             event.getNativeHandle().wait_and_throw();
         }
     };
 
     //! The SYCL queue event wait trait specialization.
-    template<typename TDev>
-    struct WaiterWaitFor<QueueGenericSyclNonBlocking<TDev>, EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct WaiterWaitFor<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
     {
-        static auto waiterWaitFor(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
+        static auto waiterWaitFor(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
         {
             queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
     };
 
     //! The SYCL queue event wait trait specialization.
-    template<typename TDev>
-    struct WaiterWaitFor<QueueGenericSyclBlocking<TDev>, EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct WaiterWaitFor<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
     {
-        static auto waiterWaitFor(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
+        static auto waiterWaitFor(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
         {
             queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
@@ -138,20 +138,20 @@ namespace alpaka::trait
     //!
     //! Any future work submitted in any queue of this device will wait for event to complete before beginning
     //! execution.
-    template<typename TDev>
-    struct WaiterWaitFor<TDev, EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct WaiterWaitFor<DevGenericSycl<TTag>, EventGenericSycl<TTag>>
     {
-        static auto waiterWaitFor(TDev& dev, EventGenericSycl<TDev> const& event)
+        static auto waiterWaitFor(DevGenericSycl<TTag>& dev, EventGenericSycl<TTag> const& event)
         {
             dev.m_impl->register_dependency(event.getNativeHandle());
         }
     };
 
     //! The SYCL device event native handle trait specialization.
-    template<typename TDev>
-    struct NativeHandle<EventGenericSycl<TDev>>
+    template<typename TTag>
+    struct NativeHandle<EventGenericSycl<TTag>>
     {
-        [[nodiscard]] static auto getNativeHandle(EventGenericSycl<TDev> const& event)
+        [[nodiscard]] static auto getNativeHandle(EventGenericSycl<TTag> const& event)
         {
             return event.getNativeHandle();
         }
diff --git a/alpaka/include/alpaka/event/EventGpuSyclIntel.hpp b/alpaka/include/alpaka/event/EventGpuSyclIntel.hpp
index d59562a9..508fb57f 100644
--- a/alpaka/include/alpaka/event/EventGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/event/EventGpuSyclIntel.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 namespace alpaka
 {
-    using EventGpuSyclIntel = EventGenericSycl<DevGpuSyclIntel>;
+    using EventGpuSyclIntel = EventGenericSycl<TagGpuSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/example/ExecuteForEachAccTag.hpp b/alpaka/include/alpaka/example/ExecuteForEachAccTag.hpp
new file mode 100644
index 00000000..1eae3d8b
--- /dev/null
+++ b/alpaka/include/alpaka/example/ExecuteForEachAccTag.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/alpaka.hpp"
+
+#include <functional>
+#include <tuple>
+#include <utility>
+
+#pragma once
+
+namespace alpaka
+{
+    //! execute a callable for each active accelerator tag
+    //
+    // @param callable callable which can be invoked with an accelerator tag
+    // @return disjunction of all invocation results
+    //
+    template<typename TCallable>
+    inline auto executeForEachAccTag(TCallable&& callable)
+    {
+        // Execute the callable once for each enabled accelerator.
+        // Pass the tag as first argument to the callable.
+        return std::apply([=](auto const&... tags) { return (callable(tags) || ...); }, alpaka::EnabledAccTags{});
+    }
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/exec/ElementIndex.hpp b/alpaka/include/alpaka/exec/ElementIndex.hpp
new file mode 100644
index 00000000..061c597f
--- /dev/null
+++ b/alpaka/include/alpaka/exec/ElementIndex.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+namespace alpaka
+{
+
+    /* ElementIndex
+     *
+     * An aggregate that containes the `.global` and `.local` indices of an element along a given dimension.
+     */
+
+    template<typename TIdx>
+    struct ElementIndex
+    {
+        TIdx global; // Index of the element along a given dimension, relative to the whole problem space.
+        TIdx local; // Index of the element along a given dimension, relative to the current group.
+    };
+
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/exec/IndependentElements.hpp b/alpaka/include/alpaka/exec/IndependentElements.hpp
new file mode 100644
index 00000000..447fa7ef
--- /dev/null
+++ b/alpaka/include/alpaka/exec/IndependentElements.hpp
@@ -0,0 +1,454 @@
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/idx/Accessors.hpp"
+
+#include <algorithm>
+#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
+#include <cstddef>
+#include <type_traits>
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+
+        /* IndependentGroupsAlong
+         *
+         * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the
+         * group indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If
+         * `groups` is not specified, it defaults to the number of blocks along the `Dim` dimension.
+         *
+         * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
+         * that can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `independentGroups(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc,
+         * 0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `independentGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * `independentGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
+         * threads in a block see the same loop iterations, while threads in different blocks may see a different
+         * number of iterations.
+         * If the work division has more blocks than the required number of groups, the first blocks will perform one
+         * iteration of the loop, while the other blocks will exit the loop immediately.
+         * If the work division has less blocks than the required number of groups, some of the blocks will perform
+         * more than one iteration, in order to cover then whole problem space.
+         *
+         * For example,
+         *
+         *   for (auto group: independentGroupsAlong<Dim>(acc, 7))
+         *
+         * will return the group range from 0 to 6, distributed across all blocks in the work division.
+         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
+         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
+         * 0 to 6 will process one group while block 7 will no process any.
+         * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+         * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0
+         * will process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6,
+         * and block 3 will process group 3.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+        class IndependentGroupsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{groups}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class IndependentGroupsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+                    : stride_{stride}
+                    , extent_{extent}
+                    , first_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return first_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the first-element-in-block index by the grid stride
+                    first_ += stride_;
+                    if(first_ < extent_)
+                        return *this;
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    first_ = extent_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (first_ == other.first_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx first_;
+            };
+
+        private:
+            Idx const first_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* independentGroups
+     *
+     * `independentGroups(acc, groups)` returns a one-dimensional iteratable range than spans the group indices from 0
+     * to `groups`. If `groups` is not specified, it defaults to the number of blocks.
+     *
+     * `independentGroups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * `independentGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a
+     * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
+     * If the work division has more blocks than the required number of groups, the first blocks will perform one
+     * iteration of the loop, while the other blocks will exit the loop immediately.
+     * If the work division has less blocks than the required number of groups, some of the blocks will perform more
+     * than one iteration, in order to cover then whole problem space.
+     *
+     * For example,
+     *
+     *   for (auto group: independentGroups(acc, 7))
+     *
+     * will return the group range from 0 to 6, distributed across all blocks in the work division.
+     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
+     * will process one group while block 7 will no process any.
+     * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+     * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will
+     * process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block
+     * 3 will process group 3.
+     *
+     * Note that `independentGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `independentGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `independentGroupsAlongX(acc, ...)`, `independentGroupsAlongY(acc, ...)`, or `independentGroupsAlongZ(acc,
+     *     ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto independentGroups(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* independentGroupsAlong<Dim>
+     *
+     * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
+     * that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlong(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* independentGroupsAlongX, Y, Z
+     *
+     * Like `independentGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* IndependentGroupElementsAlong
+         *
+         * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupElementsAlong<TAcc,
+         * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+        class IndependentGroupElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(elements_, stride_, extent_, thread_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(elements_, stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class IndependentGroupElementsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+                    : elements_{elements}
+                    ,
+                    // we need to reduce the stride by on element range because index_ is later increased with each
+                    // increment
+                    stride_{stride - elements}
+                    , extent_{extent}
+                    , index_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    ++indexElem_;
+                    ++index_;
+                    if(indexElem_ >= elements_)
+                    {
+                        indexElem_ = 0;
+                        index_ += stride_;
+                    }
+                    if(index_ >= extent_)
+                        index_ = extent_;
+
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (*(*this) == *other);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx elements_;
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx index_;
+                Idx indexElem_ = 0;
+            };
+
+        private:
+            Idx const elements_;
+            Idx const thread_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* independentGroupElements
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto independentGroupElements(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* independentGroupElementsAlong<Dim>
+     *
+     * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupElementsAlong<TAcc,
+     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* independentGroupElementsAlongX, Y, Z
+     *
+     * Like `independentGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(
+            acc,
+            static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(
+            acc,
+            static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(
+            acc,
+            static_cast<Idx>(args)...);
+    }
+
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/exec/Once.hpp b/alpaka/include/alpaka/exec/Once.hpp
new file mode 100644
index 00000000..8a2f2cb8
--- /dev/null
+++ b/alpaka/include/alpaka/exec/Once.hpp
@@ -0,0 +1,56 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/idx/Accessors.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+
+    /* oncePerGrid
+     *
+     * `oncePerGrid(acc)` returns true for a single thread within the kernel execution grid.
+     *
+     * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
+     */
+
+    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC inline constexpr bool oncePerGrid(TAcc const& acc)
+    {
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
+        using Vec = alpaka::Vec<Dim, Idx>;
+
+        // Workaround for a weird bug in oneAPI 2024.x targetting the CPU backend and FPGA emulator.
+        if constexpr(accMatchesTags<TAcc, TagCpuSycl, TagFpgaSyclIntel>)
+        {
+            // SYCL accelerator specific code
+            return acc.m_item_workdiv.get_global_linear_id() == 0;
+        }
+
+        return getIdx<Grid, Threads>(acc) == Vec::zeros();
+    }
+
+    /* oncePerBlock
+     *
+     * `oncePerBlock(acc)` returns true for a single thread within the block.
+     *
+     * Usually the condition is true for thread 0, but this index should not be relied upon.
+     */
+
+    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC inline constexpr bool oncePerBlock(TAcc const& acc)
+    {
+        return getIdx<Block, Threads>(acc) == Vec<Dim<TAcc>, Idx<TAcc>>::zeros();
+    }
+
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/exec/UniformElements.hpp b/alpaka/include/alpaka/exec/UniformElements.hpp
new file mode 100644
index 00000000..2bfbc94a
--- /dev/null
+++ b/alpaka/include/alpaka/exec/UniformElements.hpp
@@ -0,0 +1,1145 @@
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/Utility.hpp"
+#include "alpaka/exec/ElementIndex.hpp"
+#include "alpaka/idx/Accessors.hpp"
+
+#include <algorithm>
+#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
+#include <cstddef>
+#include <type_traits>
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+
+        /* UniformElementsAlong
+         *
+         * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that
+         * spans the element indices from `first` (inclusive) to `extent` (exlusive) along the `Dim` dimension. If
+         * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
+         * size along the `Dim` dimension.
+         *
+         * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
+         * can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
+         * 0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
+         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
+         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
+         * loop over each group's elements, and synchronise only in the outer loop:
+         *
+         *  for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
+         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
+         *       // first part of the computation
+         *       // no synchronisations here
+         *       ...
+         *    }
+         *    // wait for all threads to complete the first part
+         *    alpaka::syncBlockThreads();
+         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
+         *       // second part of the computation
+         *       // no synchronisations here
+         *       ...
+         *    }
+         *    // wait for all threads to complete the second part
+         *    alpaka::syncBlockThreads();
+         *    ...
+         *  }
+         *
+         * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
+         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
+         * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
+         * check the element index explicitly inside the loop:
+         *
+         *  for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
+         *    bool flag = false;
+         *    if (element < extent) {
+         *      // do some work and compute a result flag only for the valid elements
+         *      flag = do_some_work();
+         *    }
+         *    // check if any valid element had a positive result
+         *    if (alpaka::warp::any(acc, flag)) {
+         *      // ...
+         *    }
+         *  }
+         *
+         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
+         * `N-1`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+        class UniformElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(elements_, stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(elements_, stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformElementsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+                    : elements_{elements}
+                    ,
+                    // we need to reduce the stride by on element range because index_ is later increased with each
+                    // increment
+                    stride_{stride - elements}
+                    , extent_{extent}
+                    , index_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the index along the elements processed by the current thread
+                    ++indexElem_;
+                    ++index_;
+                    if(indexElem_ >= elements_)
+                    {
+                        indexElem_ = 0;
+                        index_ += stride_;
+                    }
+                    if(index_ >= extent_)
+                        index_ = extent_;
+
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (*(*this) == *other);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx elements_;
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx index_;
+                Idx indexElem_ = 0;
+            };
+
+        private:
+            Idx const elements_;
+            Idx const first_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* uniformElements
+     *
+     * `uniformElements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element
+     * indices from `first` (inclusive) to `extent` (exlusive). If `first` is not specified, it defaults to 0. If
+     * `extent` is not specified, it defaults to the kernel grid size.
+     *
+     * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
+     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
+     * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
+     * over each group's elements, and synchronise only in the outer loop:
+     *
+     *  for (auto group : uniformGroups(acc, extent)) {
+     *    for (auto element : uniformGroupElements(acc, group, extent)) {
+     *       // first part of the computation
+     *       // no synchronisations here
+     *       ...
+     *    }
+     *    // wait for all threads to complete the first part
+     *    alpaka::syncBlockThreads();
+     *    for (auto element : uniformGroupElements(acc, group, extent)) {
+     *       // second part of the computation
+     *       // no synchronisations here
+     *       ...
+     *    }
+     *    // wait for all threads to complete the second part
+     *    alpaka::syncBlockThreads();
+     *    ...
+     *  }
+     *
+     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
+     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
+     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
+     * element index explicitly inside the loop:
+     *
+     *  for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
+     *    bool flag = false;
+     *    if (element < extent) {
+     *      // do some work and compute a result flag only for elements up to extent
+     *      flag = do_some_work();
+     *    }
+     *    // check if any valid element had a positive result
+     *    if (alpaka::warp::any(acc, flag)) {
+     *      // ...
+     *    }
+     *  }
+     *
+     * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
+     *   - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
+     *     to loop along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformElementsAlong<Dim>
+     *
+     * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
+     * that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformElementsAlongX, Y, Z
+     *
+     * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* UniformElementsND
+         *
+         * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
+         * required to cover the given problem size, indicated by `extent`.
+         *
+         * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
+         *
+         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
+         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
+         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
+         * loop over each group's elements, and synchronise only in the outer loop:
+         *
+         *  for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
+         *    for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
+         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
+         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
+         *           // first part of the computation
+         *           // no synchronisations here
+         *           ...
+         *        }
+         *      }
+         *      // wait for all threads to complete the first part
+         *      alpaka::syncBlockThreads();
+         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
+         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
+         *           // second part of the computation
+         *           // no synchronisations here
+         *           ...
+         *        }
+         *      }
+         *      // wait for all threads to complete the second part
+         *      alpaka::syncBlockThreads();
+         *      ...
+         *    }
+         *  }
+         *
+         * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
+         */
+
+        template<
+            typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+        class UniformElementsND
+        {
+        public:
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using Vec = alpaka::Vec<Dim, Idx>;
+
+            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
+                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
+                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , extent_{extent}
+            {
+            }
+
+            // tag used to construct an end iterator
+            struct at_end_t
+            {
+            };
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                // check that all dimensions of the current thread index are within the extent
+                if((thread_ < extent_).all())
+                {
+                    // construct an iterator pointing to the first element to be processed by the current thread
+                    return const_iterator{this, thread_};
+                }
+                else
+                {
+                    // construct an end iterator, pointing post the end of the extent
+                    return const_iterator{this, at_end_t{}};
+                }
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                // construct an end iterator, pointing post the end of the extent
+                return const_iterator{this, at_end_t{}};
+            }
+
+            class const_iterator
+            {
+                friend class UniformElementsND;
+
+            public:
+                ALPAKA_FN_ACC inline Vec operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline constexpr const_iterator operator++()
+                {
+                    increment();
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    increment();
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
+                {
+                    return (index_ == other.index_);
+                }
+
+                ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // construct an iterator pointing to the first element to be processed by the current thread
+                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
+                    : loop_{loop}
+                    , first_{alpaka::elementwise_min(first, loop->extent_)}
+                    , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
+                    , index_{first_}
+                {
+                }
+
+                // construct an end iterator, pointing post the end of the extent
+                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
+                    : loop_{loop}
+                    , first_{loop_->extent_}
+                    , range_{loop_->extent_}
+                    , index_{loop_->extent_}
+                {
+                }
+
+                template<size_t I>
+                ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
+                {
+                    bool overflow = false;
+                    ++index_[I];
+                    if(index_[I] >= range_[I])
+                    {
+                        index_[I] = first_[I];
+                        overflow = true;
+                    }
+                    return overflow;
+                }
+
+                template<size_t N>
+                ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
+                {
+                    if constexpr(N == 0)
+                    {
+                        // overflow
+                        return true;
+                    }
+                    else
+                    {
+                        if(not nth_elements_loop<N - 1>())
+                        {
+                            return false;
+                        }
+                        else
+                        {
+                            return do_elements_loops<N - 1>();
+                        }
+                    }
+                    ALPAKA_UNREACHABLE(false);
+                }
+
+                template<size_t I>
+                ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
+                {
+                    bool overflow = false;
+                    first_[I] += loop_->stride_[I];
+                    if(first_[I] >= loop_->extent_[I])
+                    {
+                        first_[I] = loop_->thread_[I];
+                        overflow = true;
+                    }
+                    index_[I] = first_[I];
+                    range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
+                    return overflow;
+                }
+
+                template<size_t N>
+                ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
+                {
+                    if constexpr(N == 0)
+                    {
+                        // overflow
+                        return true;
+                    }
+                    else
+                    {
+                        if(not nth_strided_loop<N - 1>())
+                        {
+                            return false;
+                        }
+                        else
+                        {
+                            return do_strided_loops<N - 1>();
+                        }
+                    }
+                    ALPAKA_UNREACHABLE(false);
+                }
+
+                // increment the iterator
+                ALPAKA_FN_ACC inline constexpr void increment()
+                {
+                    // linear N-dimensional loops over the elements associated to the thread;
+                    // do_elements_loops<>() returns true if any of those loops overflows
+                    if(not do_elements_loops<Dim::value>())
+                    {
+                        // the elements loops did not overflow, return the next index
+                        return;
+                    }
+
+                    // strided N-dimensional loop over the threads in the kernel launch grid;
+                    // do_strided_loops<>() returns true if any of those loops overflows
+                    if(not do_strided_loops<Dim::value>())
+                    {
+                        // the strided loops did not overflow, return the next index
+                        return;
+                    }
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    first_ = loop_->extent_;
+                    range_ = loop_->extent_;
+                    index_ = loop_->extent_;
+                }
+
+                // const pointer to the UniformElementsND that the iterator refers to
+                UniformElementsND const* loop_;
+
+                // modified by the pre/post-increment operator
+                Vec first_; // first element processed by this thread
+                Vec range_; // last element processed by this thread
+                Vec index_; // current element processed by this thread
+            };
+
+        private:
+            Vec const elements_;
+            Vec const thread_;
+            Vec const stride_;
+            Vec const extent_;
+        };
+
+    } // namespace detail
+
+    /* uniformElementsND
+     *
+     * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
+     */
+
+    template<
+        typename TAcc,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
+    {
+        return detail::UniformElementsND<TAcc>(acc);
+    }
+
+    template<
+        typename TAcc,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsND(
+        TAcc const& acc,
+        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent)
+    {
+        return detail::UniformElementsND<TAcc>(acc, extent);
+    }
+
+    namespace detail
+    {
+
+        /* UniformGroupsAlong
+         *
+         * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iteratable range than spans the group
+         * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
+         * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
+         * kernel grid size along the `Dim` dimension.
+         *
+         * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
+         * infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<Tacc, 0>(acc,
+         * ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
+         * threads in a block see the same loop iterations, while threads in different blocks may see a different
+         * number of iterations. If the work division has more blocks than the required number of groups, the first
+         * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
+         * work division has less blocks than the required number of groups, some of the blocks will perform more than
+         * one iteration, in order to cover then whole problem space.
+         *
+         * If the problem size is not a multiple of the block size, the last group will process a number of elements
+         * smaller than the block size. However, also in this case all threads in the block will execute the same
+         * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
+         * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
+         * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
+         *
+         * For example, if the block size is 64 and there are 400 elements
+         *
+         *   for (auto group: uniformGroupsAlong<Dim>(acc, 400)
+         *
+         * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
+         * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
+         * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
+         * last group; it is up to the inner loop to not process the non-existing elements after 399.
+         *
+         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
+         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
+         * 0 to 6 will process one group while block 7 will no process any.
+         *
+         * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+         * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0
+         * will process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6,
+         * and block 3 will process group 3.
+         *
+         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
+         * `uniformGroupElementsAlong<Dim>`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+        class UniformGroupsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{stride_}
+            {
+            }
+
+            // extent is the total number of elements (not blocks)
+            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformGroupsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+                    : stride_{stride}
+                    , extent_{extent}
+                    , first_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return first_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the first-element-in-block index by the grid stride
+                    first_ += stride_;
+                    if(first_ < extent_)
+                        return *this;
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    first_ = extent_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (first_ == other.first_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx first_;
+            };
+
+        private:
+            Idx const first_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* uniformGroups
+     *
+     * `uniformGroups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required
+     * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
+     * across all groups; if not specified, it defaults to the kernel grid size.
+     *
+     * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
+     * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
+     * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
+     * the loop, while the other blocks will exit the loop immediately. If the work division has less blocks than the
+     * required number of groups, some of the blocks will perform more than one iteration, in order to cover then whole
+     * problem space.
+     *
+     * If the problem size is not a multiple of the block size, the last group will process a number of elements
+     * smaller than the block size. However, also in this case all threads in the block will execute the same number of
+     * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
+     * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
+     * implemented by `uniformGroupElements(acc, group, elements)`.
+     *
+     * For example, if the block size is 64 and there are 400 elements
+     *
+     *   for (auto group: uniformGroups(acc, 400)
+     *
+     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
+     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
+     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
+     * it is up to the inner loop to not process the non-existing elements after 399.
+     *
+     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
+     * will process one group while block 7 will no process any.
+     *
+     * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+     * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will
+     * process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block
+     * 3 will process group 3.
+     *
+     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
+     *
+     * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
+     *     along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformGroupsAlong<Dim>
+     *
+     * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
+     * can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformGroupsAlongX, Y, Z
+     *
+     * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* UniformGroupElementsAlong
+         *
+         * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iteratable range that
+         * spans all the elements within the given `group` along dimension `Dim`, as obtained from
+         * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
+         * across all groups; if not specified, it defaults to the kernel grid size.
+         *
+         * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
+         * ...)` that can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
+         * `UniformGroupElementsAlong<0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local`
+         * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
+         * (excluded), while the local index spans the range from 0 to the block size (excluded).
+         *
+         * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
+         * the global element index reaches `elements`.
+         *
+         * If the problem size is not a multiple of the block size, different threads may execute a different number of
+         * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
+         * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
+         * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
+         * `uniformGroupElementsAlong<Dim>`.
+         *
+         * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
+         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
+         * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
+         * check the element index explicitly inside the loop:
+         *
+         *  for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements,
+         * alpaka::warp::getSize(acc)))) { bool flag = false; if (element < elements) {
+         *      // do some work and compute a result flag only for the valid elements
+         *      flag = do_some_work();
+         *    }
+         *    // check if any valid element had a positive result
+         *    if (alpaka::warp::any(acc, flag)) {
+         *      // ...
+         *    }
+         *  }
+         *
+         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
+         * `N-1`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+        class UniformGroupElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
+                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
+                , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
+                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
+                , local_{std::min(
+                      extent - first_,
+                      alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
+                          * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
+                , range_{
+                      std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(local_, first_, range_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(range_, first_, range_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformGroupElementsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
+                    : index_{local}
+                    , first_{first}
+                    , range_{range}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
+                {
+                    return ElementIndex<Idx>{index_ + first_, index_};
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the index along the elements processed by the current thread
+                    ++index_;
+                    if(index_ < range_)
+                        return *this;
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    index_ = range_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (index_ == other.index_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // modified by the pre/post-increment operator
+                Idx index_;
+                // non-const to support iterator copy and assignment
+                Idx first_;
+                Idx range_;
+            };
+
+        private:
+            Idx const first_;
+            Idx const local_;
+            Idx const range_;
+        };
+
+    } // namespace detail
+
+    /* uniformGroupElements
+     *
+     * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iteratable range that spans all the
+     * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
+     * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
+     *
+     * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
+     *
+     * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
+     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
+     * while the local index spans the range from 0 to the block size (excluded).
+     *
+     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
+     * global element index reaches `elements`.
+     *
+     * If the problem size is not a multiple of the block size, different threads may execute a different number of
+     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
+     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
+     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
+     *
+     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
+     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
+     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
+     * element index explicitly inside the loop:
+     *
+     *  for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
+     *    bool flag = false;
+     *    if (element < elements) {
+     *      // do some work and compute a result flag only for the valid elements
+     *      flag = do_some_work();
+     *    }
+     *    // check if any valid element had a positive result
+     *    if (alpaka::warp::any(acc, flag)) {
+     *      // ...
+     *    }
+     *  }
+     *
+     * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
+     * kernels, use
+     *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
+     *     `Dim`;
+     *   - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
+     *     `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
+     *     dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformGroupElementsAlong<Dim>
+     *
+     * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
+     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
+    }
+
+    /* uniformGroupElementsAlongX, Y, Z
+     *
+     * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    }
+
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/kernel/KernelFunctionAttributes.hpp b/alpaka/include/alpaka/kernel/KernelFunctionAttributes.hpp
new file mode 100644
index 00000000..03714305
--- /dev/null
+++ b/alpaka/include/alpaka/kernel/KernelFunctionAttributes.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2022 René Widera, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace alpaka
+{
+    //! Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using the kernel
+    //! function as an argument. In case of a CPU backend, maxThreadsPerBlock is set to 1 and other values remain zero
+    //! since there are no correponding API functions to get the values.
+    struct KernelFunctionAttributes
+    {
+        std::size_t constSizeBytes{0};
+        std::size_t localSizeBytes{0};
+        std::size_t sharedSizeBytes{0};
+        int maxDynamicSharedSizeBytes{0};
+        int numRegs{0};
+        // This field is ptx or isa version if the backend is GPU
+        int asmVersion{0};
+        int maxThreadsPerBlock{0};
+    };
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/kernel/SyclSubgroupSize.hpp b/alpaka/include/alpaka/kernel/SyclSubgroupSize.hpp
index b56b652c..1c7124b6 100644
--- a/alpaka/include/alpaka/kernel/SyclSubgroupSize.hpp
+++ b/alpaka/include/alpaka/kernel/SyclSubgroupSize.hpp
@@ -6,81 +6,104 @@
 
 #    ifdef __SYCL_DEVICE_ONLY__
 
-#        if defined(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                     \
-            defined(__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                       \
-            defined(__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                     \
-            defined(__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                   \
-            defined(__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                   \
-            defined(__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                   \
-            defined(__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                  \
-            defined(__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                    \
-            defined(__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                    \
-            defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                    \
-            defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                  \
-            defined(__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                   \
-            defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S Intel graphics architecture */                \
-            defined(__SYCL_TARGET_INTEL_GPU_RPL_S__) || /* Raptor Lake Intel graphics architecture */                 \
-            defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                \
-            defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                \
-            defined(__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                           \
-            defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */             \
-            defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */             \
-            defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) /* Alchemist G12 Intel graphics architecture */
+#        if(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                             \
+            (__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                              \
+            (__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                            \
+            (__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                         \
+            (__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_EHL__) || /* Elkhart Lake or Jasper Lake Intel graphics architecture */          \
+            (__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                         \
+            (__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S or Raptor Lake S Intel graphics architecture */      \
+            (__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                                  \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G12__) || /* Alchemist G12 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_MTL_U__) || /* Meteor Lake U/S or Arrow Lake U/S Intel graphics architecture */  \
+            (__SYCL_TARGET_INTEL_GPU_MTL_H__) || /* Meteor Lake H Intel graphics architecture */                      \
+            (__SYCL_TARGET_INTEL_GPU_ARL_H__) || /* Arrow Lake H Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_BMG_G21__) || /* Battlemage G21 Intel graphics architecture */                   \
+            (__SYCL_TARGET_INTEL_GPU_LNL_M__) /* Lunar Lake Intel graphics architecture */
 
 #            define SYCL_SUBGROUP_SIZE (8 | 16 | 32)
 
-#        elif defined(__SYCL_TARGET_INTEL_GPU_PVC__) /* Ponte Vecchio Intel graphics architecture */
+#        elif(__SYCL_TARGET_INTEL_GPU_PVC__) || /* Ponte Vecchio Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_PVC_VG__) /* Ponte Vecchio VG Intel graphics architecture */
 
 #            define SYCL_SUBGROUP_SIZE (16 | 32)
 
-#        elif defined(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
+#        elif(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
 
 #            define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64)
 
-#        elif defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */ \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */   \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */      \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */    \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */    \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */             \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) || /* NVIDIA Volta architecture (compute capability 7.0) */     \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */             \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) || /* NVIDIA Turing architecture (compute capability 7.5) */    \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */    \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */    \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */  \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */     \
-            defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
+#        elif(__SYCL_TARGET_NVIDIA_GPU_SM50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */          \
+            (__SYCL_TARGET_NVIDIA_GPU_SM52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */           \
+            (__SYCL_TARGET_NVIDIA_GPU_SM53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */              \
+            (__SYCL_TARGET_NVIDIA_GPU_SM60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */                     \
+            (__SYCL_TARGET_NVIDIA_GPU_SM70__) || /* NVIDIA Volta architecture (compute capability 7.0) */             \
+            (__SYCL_TARGET_NVIDIA_GPU_SM72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */                     \
+            (__SYCL_TARGET_NVIDIA_GPU_SM75__) || /* NVIDIA Turing architecture (compute capability 7.5) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */          \
+            (__SYCL_TARGET_NVIDIA_GPU_SM89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */             \
+            (__SYCL_TARGET_NVIDIA_GPU_SM90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
 
 #            define SYCL_SUBGROUP_SIZE (32)
 
-#        elif defined(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */         \
-            defined(__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
-            defined(__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
-            defined(__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
-            defined(__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
-            defined(__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */        \
-            defined(__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
-            defined(__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */      \
-            defined(__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
-            defined(__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
-            defined(__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
-            defined(__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */               \
-            defined(__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */             \
-            defined(__SYCL_TARGET_AMD_GPU_GFX90A__) /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */
+#        elif(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                 \
+            (__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */                      \
+            (__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */                    \
+            (__SYCL_TARGET_AMD_GPU_GFX909__) || /* AMD GCN 5.0 Raven 2 architecture (gfx 9.0) */                      \
+            (__SYCL_TARGET_AMD_GPU_GFX90A__) || /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX90C__) || /* AMD GCN 5.1 Renoir architecture (gfx 9.0) */                       \
+            (__SYCL_TARGET_AMD_GPU_GFX940__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX941__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX942__) /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */
 
 #            define SYCL_SUBGROUP_SIZE (64)
 
-#        elif defined(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */          \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */            \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */            \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */             \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */            \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */            \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */            \
-            defined(__SYCL_TARGET_AMD_GPU_GFX1034__) /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */
-
-#            define SYCL_SUBGROUP_SIZE (32 | 64)
+#        elif(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */                    \
+            (__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1033__) || /* AMD RDNA 2.0 Van Gogh architecture (gfx 10.3) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX1034__) || /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1035__) || /* AMD RDNA 2.0 Rembrandt Mobile architecture (gfx 10.3) */          \
+            (__SYCL_TARGET_AMD_GPU_GFX1036__) || /* AMD RDNA 2.0 Raphael architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1100__) || /* AMD RDNA 3.0 Navi 31 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1101__) || /* AMD RDNA 3.0 Navi 32 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1102__) || /* AMD RDNA 3.0 Navi 33 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1103__) || /* AMD RDNA 3.0 Phoenix mobile architecture (gfx 11.0) */            \
+            (__SYCL_TARGET_AMD_GPU_GFX1150__) || /* AMD RDNA 3.5 Strix Point architecture (gfx 11.5) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX1151__) || /* AMD RDNA 3.5 Strix Halo architecture (gfx 11.5) */                \
+            (__SYCL_TARGET_AMD_GPU_GFX1200__) || /* AMD RDNA 4.0 Navi 44 architecture (gfx 12.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1201__) /* AMD RDNA 4.0 Navi 48 architecture (gfx 12.0) */
+
+// starting from gfx10, HIP supports only wavefront size 32
+#            define SYCL_SUBGROUP_SIZE (32)
 
 #        else // __SYCL_TARGET_*
 
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
index 6bf45a27..f0d60566 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
@@ -17,7 +17,9 @@
 #include "alpaka/core/OmpSchedule.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 #include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
 #include <functional>
@@ -31,6 +33,11 @@
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wswitch-default"
+#    endif
+
 #    if _OPENMP < 200203
 #        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
 #    endif
@@ -824,10 +831,6 @@ namespace alpaka
 
             // The number of blocks in the grid.
             TIdx const numBlocksInGrid(gridBlockExtent.prod());
-            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-            {
-                throw std::runtime_error("Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
-            }
 
             // Get the OpenMP schedule information for the given kernel and parameter types
             auto const schedule = std::apply(
@@ -946,7 +949,43 @@ namespace alpaka
         {
             using type = TIdx;
         };
+
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
     } // namespace trait
 } // namespace alpaka
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
 #endif
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
index 1037139f..6b08e969 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
@@ -15,8 +15,10 @@
 #include "alpaka/acc/AccCpuOmp2Threads.hpp"
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
 #include <functional>
@@ -192,6 +194,38 @@ namespace alpaka
         {
             using type = TIdx;
         };
+
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
     } // namespace trait
 } // namespace alpaka
 
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
index 44f5d2ce..a9a370d1 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
@@ -15,8 +15,10 @@
 #include "alpaka/acc/AccCpuSerial.hpp"
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
 #include <functional>
@@ -77,11 +79,6 @@ namespace alpaka
                 *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
                 blockSharedMemDynSizeBytes);
 
-            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-            {
-                throw std::runtime_error("A block for the serial accelerator can only ever have one single thread!");
-            }
-
             // Execute the blocks serially.
             meta::ndLoopIncIdx(
                 gridBlockExtent,
@@ -137,6 +134,37 @@ namespace alpaka
         {
             using type = TIdx;
         };
+
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuSerial<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuSerial<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
     } // namespace trait
 } // namespace alpaka
 
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuSycl.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuSycl.hpp
index 3d0a3086..b811a63a 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuSycl.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuSycl.hpp
@@ -1,20 +1,20 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/kernel/TaskKernelGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    template<typename TDim, typename TIdx>
-    class AccCpuSycl;
-
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelCpuSycl = TaskKernelGenericSycl<AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+    using TaskKernelCpuSycl
+        = TaskKernelGenericSycl<TagCpuSycl, AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
index 4b8f6b8b..4ca90dd5 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -16,8 +16,10 @@
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/dev/DevCpu.hpp"
 #include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
 #include <functional>
@@ -81,11 +83,6 @@ namespace alpaka
             // The number of blocks in the grid.
             TIdx const numBlocksInGrid = gridBlockExtent.prod();
 
-            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-            {
-                throw std::runtime_error("A block for the TBB accelerator can only ever have one single thread!");
-            }
-
             tbb::this_task_arena::isolate(
                 [&]
                 {
@@ -149,6 +146,37 @@ namespace alpaka
         {
             using type = TIdx;
         };
+
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
     } // namespace trait
 } // namespace alpaka
 
diff --git a/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
index 8bb518a8..850b6615 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
@@ -17,8 +17,10 @@
 #include "alpaka/core/Decay.hpp"
 #include "alpaka/core/ThreadPool.hpp"
 #include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
 #include <algorithm>
@@ -200,6 +202,38 @@ namespace alpaka
         {
             using type = TIdx;
         };
+
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuThreads<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
     } // namespace trait
 } // namespace alpaka
 
diff --git a/alpaka/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp b/alpaka/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
index d041c17a..61631659 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
@@ -1,21 +1,20 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/kernel/TaskKernelGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
-    template<typename TDim, typename TIdx>
-    class AccFpgaSyclIntel;
-
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
     using TaskKernelFpgaSyclIntel
-        = TaskKernelGenericSycl<AccFpgaSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+        = TaskKernelGenericSycl<TagFpgaSyclIntel, AccFpgaSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/alpaka/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index 450b80d1..11cc2cae 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -1,18 +1,20 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/AccGenericSycl.hpp"
 #include "alpaka/acc/Traits.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp"
 #include "alpaka/core/BoostPredef.hpp"
 #include "alpaka/core/Sycl.hpp"
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/SyclSubgroupSize.hpp"
 #include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
 #include "alpaka/platform/Traits.hpp"
 #include "alpaka/queue/Traits.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
@@ -69,7 +71,7 @@
 namespace alpaka
 {
     //! The SYCL accelerator execution task.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
     class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
     {
     public:
@@ -276,6 +278,35 @@ namespace alpaka::trait
     {
         using type = TIdx;
     };
+
+    //! \brief Specialisation of the class template FunctionAttributes
+    //! \tparam TTag The SYCL device selector.
+    //! \tparam TDev The device type.
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \tparam TKernelFn Kernel function object type.
+    //! \tparam TArgs Kernel function object argument types as a parameter pack.
+    template<typename TTag, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+    struct FunctionAttributes<AccGenericSycl<TTag, TDim, TIdx>, TDev, TKernelFn, TArgs...>
+    {
+        //! \param dev The device instance
+        //! \param kernelFn The kernel function object which should be executed.
+        //! \param args The kernel invocation arguments.
+        //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+        //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+        ALPAKA_FN_HOST static auto getFunctionAttributes(
+            TDev const& dev,
+            [[maybe_unused]] TKernelFn const& kernelFn,
+            [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+        {
+            alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+            // set function properties for maxThreadsPerBlock to device properties
+            auto const& props = alpaka::getAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>(dev);
+            kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+            return kernelFunctionAttributes;
+        }
+    };
 } // namespace alpaka::trait
 
 #    undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
diff --git a/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp b/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
index 59aa4761..416e8937 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
@@ -11,8 +11,9 @@
 
 namespace alpaka
 {
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelGpuCudaRt = TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>;
+    template<typename TAcc, typename TDev, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    using TaskKernelGpuCudaRt
+        = TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDev, TDim, TIdx, TKernelFnObj, TArgs...>;
 } // namespace alpaka
 
 #endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/alpaka/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp b/alpaka/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
index 0e970e8f..e5c5a9a7 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
@@ -1,21 +1,20 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/kernel/TaskKernelGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 namespace alpaka
 {
-    template<typename TDim, typename TIdx>
-    class AccGpuSyclIntel;
-
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
     using TaskKernelGpuSyclIntel
-        = TaskKernelGenericSycl<AccGpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+        = TaskKernelGenericSycl<TagGpuSyclIntel, AccGpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
index e2d7ed3d..53bbaf67 100644
--- a/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
+++ b/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
@@ -1,5 +1,5 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
- * Manfred Gruber, Antonio Di Pilato
+/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
+ * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -17,11 +17,11 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
 #include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
 #include "alpaka/workdiv/WorkDivHelpers.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
@@ -64,11 +64,7 @@ namespace alpaka
             TKernelFnObj const kernelFnObj,
             TArgs... args)
         {
-#        if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
-#            error "Device capability >= 2.0 is required!"
-#        endif
-
-            const TAcc acc(threadElemExtent);
+            TAcc const acc(threadElemExtent);
 
 // with clang it is not possible to query std::result_of for a pure device lambda created on the host side
 #        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
@@ -180,14 +176,21 @@ namespace alpaka
             using type = TIdx;
         };
 
-        //! The CUDA/HIP non-blocking kernel enqueue trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        //! The CUDA/HIP kernel enqueue trait specialization.
+        template<
+            typename TApi,
+            bool TBlocking,
+            typename TAcc,
+            typename TDim,
+            typename TIdx,
+            typename TKernelFnObj,
+            typename... TArgs>
         struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
+            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
             TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
         {
             ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
                 TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
             {
                 ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
@@ -196,10 +199,10 @@ namespace alpaka
 #        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 // std::size_t printfFifoSize;
                 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
                 // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
                 // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
 #        endif
                 auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
                 auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
@@ -210,13 +213,15 @@ namespace alpaka
                 uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
 
 #        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x
-                          << " blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x << std::endl;
+                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
+                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
+                          << ")\n";
 #        endif
 
 #        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
+                // This checks for a valid work division that is also compliant with the hardware maxima of the
+                // accelerator.
+                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
                 {
                     throw std::runtime_error(
                         "The given work division is not valid or not supported by the device of type "
@@ -240,6 +245,7 @@ namespace alpaka
                 std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
                           << std::endl;
 #        endif
+
                 auto kernelName = alpaka::detail::
                     gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
 
@@ -257,6 +263,7 @@ namespace alpaka
 
                 // Set the current device.
                 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
+
                 // Enqueue the kernel execution.
                 // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
                 // language extension expects the arguments by value. This forces the type of a float argument given
@@ -274,120 +281,88 @@ namespace alpaka
                     },
                     task.m_args);
 
-                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
+                if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
                 {
                     // Wait for the kernel execution to finish but do not check error return of this call.
                     // Do not use the alpaka::wait method because it checks the error itself but we want to give a
                     // custom error message.
                     std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
-                    auto const msg = std::string{
-                        "'execution of kernel: '" + std::string{core::demangled<TKernelFnObj>} + "' failed with"};
+                }
+                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
+                {
+                    auto const msg
+                        = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
                     ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
                 }
             }
         };
 
-        //! The CUDA/HIP synchronous kernel enqueue trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        //! \brief Specialisation of the class template FunctionAttributes
+        //! \tparam TApi The type the API of the GPU accelerator backend. Currently Cuda or Hip.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
         {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
+            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
             {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
+                auto kernelName = alpaka::detail::gpuKernel<
+                    TKernelFn,
+                    TApi,
+                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
+                    TDim,
+                    TIdx,
+                    remove_restrict_t<std::decay_t<TArgs>>...>;
 
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // std::size_t printfFifoSize;
-                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
-                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                typename TApi::FuncAttributes_t funcAttrs;
+#        if BOOST_COMP_GNUC
+                // Disable and enable compile warnings for gcc
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconditionally-supported"
 #        endif
-                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
-                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
-                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
-
-                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
-                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
-                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << "gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x << std::endl;
-                std::cout << __func__ << "blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x
-                          << std::endl;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
 #        endif
 
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
-                {
-                    throw std::runtime_error(
-                        "The given work division is not valid or not supported by the device of type "
-                        + getAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>() + "!");
-                }
-#        endif
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
+                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
+                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
+                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
+                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
 
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes = std::apply(
-                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
-                        return getBlockSharedMemDynSizeBytes<TAcc>(
-                            task.m_kernelFnObj,
-                            blockThreadExtent,
-                            threadElemExtent,
-                            args...);
-                    },
-                    task.m_args);
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // Log the block shared memory idx.
-                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                          << std::endl;
-#        endif
-
-                auto kernelName = alpaka::detail::
-                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
 #        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // Log the function attributes.
-                typename TApi::FuncAttributes_t funcAttrs;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
-                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
-                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
-                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
+                printf("Kernel Function Attributes: \n");
+                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
+                printf(
+                    "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu  maxDynamicSharedSizeBytes: %d "
+                    "\n",
+                    funcAttrs.constSizeBytes,
+                    funcAttrs.localSizeBytes,
+                    funcAttrs.sharedSizeBytes,
+                    funcAttrs.maxDynamicSharedSizeBytes);
+
+                printf(
+                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
+                    funcAttrs.numRegs,
+                    funcAttrs.ptxVersion,
+                    funcAttrs.maxThreadsPerBlock);
 #        endif
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
-
-                // Enqueue the kernel execution.
-                std::apply(
-                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
-                    {
-                        kernelName<<<
-                            gridDim,
-                            blockDim,
-                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
-                    },
-                    task.m_args);
-
-                // Wait for the kernel execution to finish but do not check error return of this call.
-                // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom
-                // error message.
-                std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
-                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
-                {
-                    auto const msg
-                        = std::string{"'execution of kernel: '" + core::demangled<TKernelFnObj> + "' failed with"};
-                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
-                }
+                return kernelFunctionAttributes;
             }
         };
     } // namespace trait
diff --git a/alpaka/include/alpaka/kernel/Traits.hpp b/alpaka/include/alpaka/kernel/Traits.hpp
index 44b6476a..c2c0a55b 100644
--- a/alpaka/include/alpaka/kernel/Traits.hpp
+++ b/alpaka/include/alpaka/kernel/Traits.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
- *                Andrea Bocci, Aurora Perego
+ *                Andrea Bocci, Aurora Perego, Mehmet Yusufoglu
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,7 @@
 #include "alpaka/core/OmpSchedule.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
 #include "alpaka/queue/Traits.hpp"
 #include "alpaka/vec/Vec.hpp"
 #include "alpaka/workdiv/Traits.hpp"
@@ -69,6 +70,29 @@ namespace alpaka
             }
         };
 
+        //! \brief The structure template to access to the functions attributes of a kernel function object.
+        //! \tparam TAcc The accelerator type
+        //! \tparam TKernelFnObj Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
+        struct FunctionAttributes
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes data structure instance. The default version always returns the
+            //! instance with fields which are set to zero.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFnObj const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                std::string const str
+                    = std::string(__func__) + " function is not specialised for the given arguments.\n";
+                throw std::invalid_argument{str};
+            }
+        };
+
         //! The trait for getting the warp size required by a kernel.
         //!
         //! \tparam TKernelFnObj The kernel function object.
@@ -142,13 +166,13 @@ namespace alpaka
 #    pragma clang diagnostic ignored                                                                                  \
         "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-    //! \tparam TAcc The accelerator type.
-    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-    //! \param blockThreadExtent The block thread extent.
-    //! \param threadElemExtent The thread element extent.
-    //! \param args,... The kernel invocation arguments.
-    //! \return The size of the shared memory allocated for a block in bytes.
-    //! The default implementation always returns zero.
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The size of the shared memory allocated for a block in bytes.
+//! The default implementation always returns zero.
 #if BOOST_COMP_CLANG
 #    pragma clang diagnostic pop
 #endif
@@ -167,18 +191,37 @@ namespace alpaka
             args...);
     }
 
+    //! \tparam TAcc The accelerator type.
+    //! \tparam TDev The device type.
+    //! \param dev The device instance
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return KernelFunctionAttributes instance. Instance is filled with values returned by the accelerator API
+    //! depending on the specific kernel. The default version always returns the instance with fields which are set to
+    //! zero.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        -> alpaka::KernelFunctionAttributes
+    {
+        return trait::FunctionAttributes<TAcc, TDev, TKernelFnObj, TArgs...>::getFunctionAttributes(
+            dev,
+            kernelFnObj,
+            std::forward<TArgs>(args)...);
+    }
+
 #if BOOST_COMP_CLANG
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored                                                                                  \
         "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-    //! \tparam TAcc The accelerator type.
-    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-    //! \param blockThreadExtent The block thread extent.
-    //! \param threadElemExtent The thread element extent.
-    //! \param args,... The kernel invocation arguments.
-    //! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
-    //!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
+//!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
 #if BOOST_COMP_CLANG
 #    pragma clang diagnostic pop
 #endif
@@ -248,6 +291,33 @@ namespace alpaka
         }
     } // namespace detail
 
+    //! Check if the kernel type is trivially copyable
+    //!
+    //! \attention In case this trait is specialized for a user type the user should be sure that the result of calling
+    //! the copy constructor is equal to use memcpy to duplicate the object. An existing destructor should be free
+    //! of side effects.
+    //!
+    //! The default implementation is true for trivially copyable types (or for extended lambda expressions for CUDA).
+    //!
+    //! @tparam T type to check
+    //! @{
+    template<typename T, typename = void>
+    struct IsKernelTriviallyCopyable
+#if BOOST_COMP_NVCC
+        : std::bool_constant<
+              std::is_trivially_copyable_v<T> || __nv_is_extended_device_lambda_closure_type(T)
+              || __nv_is_extended_host_device_lambda_closure_type(T)>
+#else
+        : std::is_trivially_copyable<T>
+#endif
+    {
+    };
+
+    template<typename T>
+    inline constexpr bool isKernelTriviallyCopyable = IsKernelTriviallyCopyable<T>::value;
+
+//! @}
+
 //! Creates a kernel execution task.
 //!
 //! \tparam TAcc The accelerator type.
@@ -266,11 +336,10 @@ namespace alpaka
 
 #if BOOST_COMP_NVCC
         static_assert(
-            std::is_trivially_copyable_v<TKernelFnObj> || __nv_is_extended_device_lambda_closure_type(TKernelFnObj)
-                || __nv_is_extended_host_device_lambda_closure_type(TKernelFnObj),
+            isKernelTriviallyCopyable<TKernelFnObj>,
             "Kernels must be trivially copyable or an extended CUDA lambda expression!");
 #else
-        static_assert(std::is_trivially_copyable_v<TKernelFnObj>, "Kernels must be trivially copyable!");
+        static_assert(isKernelTriviallyCopyable<TKernelFnObj>, "Kernels must be trivially copyable!");
 #endif
         (detail::assertKernelArgIsTriviallyCopyable<std::decay_t<TArgs>>(), ...);
         static_assert(
diff --git a/alpaka/include/alpaka/math/Complex.hpp b/alpaka/include/alpaka/math/Complex.hpp
index 9359ac86..f265c7bf 100644
--- a/alpaka/include/alpaka/math/Complex.hpp
+++ b/alpaka/include/alpaka/math/Complex.hpp
@@ -14,560 +14,569 @@
 
 namespace alpaka
 {
-    //! Implementation of a complex number useable on host and device.
-    //!
-    //! It follows the layout of std::complex and so array-oriented access.
-    //! The class template implements all methods and operators as std::complex<T>.
-    //! Additionally, it provides an implicit conversion to and from std::complex<T>.
-    //! All methods besides operators << and >> are host-device.
-    //! It does not provide non-member functions of std::complex besides the operators.
-    //! Those are provided the same way as alpaka math functions for real numbers.
-    //!
-    //! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
-    //!
-    //! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
-    //! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
-    //! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the common
-    //! interface for genetic code anyways.
-    //! So it is more clear to have alpaka's interface exactly matching when possible, and not "improving".
-    //!
-    //! @tparam T type of the real and imaginary part: float, double, or long double.
-    template<typename T>
-    class Complex
-    {
-    public:
-        // Make sure the input type is floating-point
-        static_assert(std::is_floating_point_v<T>);
-
-        //! Type of the real and imaginary parts
-        using value_type = T;
-
-        //! Constructor from the given real and imaginary parts
-        constexpr ALPAKA_FN_HOST_ACC Complex(T const& real = T{}, T const& imag = T{}) : m_real(real), m_imag(imag)
+    namespace internal
+    {
+        //! Implementation of a complex number useable on host and device.
+        //!
+        //! It follows the layout of std::complex and so array-oriented access.
+        //! The class template implements all methods and operators as std::complex<T>.
+        //! Additionally, it provides an implicit conversion to and from std::complex<T>.
+        //! All methods besides operators << and >> are host-device.
+        //! It does not provide non-member functions of std::complex besides the operators.
+        //! Those are provided the same way as alpaka math functions for real numbers.
+        //!
+        //! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
+        //!
+        //! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
+        //! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
+        //! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the
+        //! common interface for generic code anyways. So it is more clear to have alpaka's interface exactly matching
+        //! when possible, and not "improving".
+        //!
+        //! @tparam T type of the real and imaginary part: float, double, or long double.
+        template<typename T>
+        class Complex
+        {
+        public:
+            // Make sure the input type is floating-point
+            static_assert(std::is_floating_point_v<T>);
+
+            //! Type of the real and imaginary parts
+            using value_type = T;
+
+            //! Constructor from the given real and imaginary parts
+            constexpr ALPAKA_FN_HOST_ACC Complex(T const& real = T{}, T const& imag = T{}) : m_real(real), m_imag(imag)
+            {
+            }
+
+            //! Copy constructor
+            constexpr Complex(Complex const& other) = default;
+
+            //! Constructor from Complex of another type
+            template<typename U>
+            constexpr ALPAKA_FN_HOST_ACC Complex(Complex<U> const& other)
+                : m_real(static_cast<T>(other.real()))
+                , m_imag(static_cast<T>(other.imag()))
+            {
+            }
+
+            //! Constructor from std::complex
+            constexpr ALPAKA_FN_HOST_ACC Complex(std::complex<T> const& other)
+                : m_real(other.real())
+                , m_imag(other.imag())
+            {
+            }
+
+            //! Conversion to std::complex
+            constexpr ALPAKA_FN_HOST_ACC operator std::complex<T>() const
+            {
+                return std::complex<T>{m_real, m_imag};
+            }
+
+            //! Assignment
+            Complex& operator=(Complex const&) = default;
+
+            //! Get the real part
+            constexpr ALPAKA_FN_HOST_ACC T real() const
+            {
+                return m_real;
+            }
+
+            //! Set the real part
+            constexpr ALPAKA_FN_HOST_ACC void real(T value)
+            {
+                m_real = value;
+            }
+
+            //! Get the imaginary part
+            constexpr ALPAKA_FN_HOST_ACC T imag() const
+            {
+                return m_imag;
+            }
+
+            //! Set the imaginary part
+            constexpr ALPAKA_FN_HOST_ACC void imag(T value)
+            {
+                m_imag = value;
+            }
+
+            //! Addition assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator+=(T const& other)
+            {
+                m_real += other;
+                return *this;
+            }
+
+            //! Addition assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator+=(Complex<U> const& other)
+            {
+                m_real += static_cast<T>(other.real());
+                m_imag += static_cast<T>(other.imag());
+                return *this;
+            }
+
+            //! Subtraction assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator-=(T const& other)
+            {
+                m_real -= other;
+                return *this;
+            }
+
+            //! Subtraction assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator-=(Complex<U> const& other)
+            {
+                m_real -= static_cast<T>(other.real());
+                m_imag -= static_cast<T>(other.imag());
+                return *this;
+            }
+
+            //! Multiplication assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator*=(T const& other)
+            {
+                m_real *= other;
+                m_imag *= other;
+                return *this;
+            }
+
+            //! Multiplication assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator*=(Complex<U> const& other)
+            {
+                auto const newReal = m_real * static_cast<T>(other.real()) - m_imag * static_cast<T>(other.imag());
+                auto const newImag = m_imag * static_cast<T>(other.real()) + m_real * static_cast<T>(other.imag());
+                m_real = newReal;
+                m_imag = newImag;
+                return *this;
+            }
+
+            //! Division assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator/=(T const& other)
+            {
+                m_real /= other;
+                m_imag /= other;
+                return *this;
+            }
+
+            //! Division assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator/=(Complex<U> const& other)
+            {
+                return *this *= Complex{
+                           static_cast<T>(other.real() / (other.real() * other.real() + other.imag() * other.imag())),
+                           static_cast<T>(
+                               -other.imag() / (other.real() * other.real() + other.imag() * other.imag()))};
+            }
+
+        private:
+            //! Real and imaginary parts, storage enables array-oriented access
+            T m_real, m_imag;
+        };
+
+        //! Host-device arithmetic operations matching std::complex<T>.
+        //!
+        //! They take and return alpaka::Complex.
+        //!
+        //! @{
+        //!
+
+        //! Unary plus (added for compatibility with std::complex)
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& val)
         {
+            return val;
         }
 
-        //! Copy constructor
-        constexpr Complex(Complex const& other) = default;
-
-        //! Constructor from Complex of another type
-        template<typename U>
-        constexpr ALPAKA_FN_HOST_ACC Complex(Complex<U> const& other)
-            : m_real(static_cast<T>(other.real()))
-            , m_imag(static_cast<T>(other.imag()))
+        //! Unary minus
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& val)
         {
+            return Complex<T>{-val.real(), -val.imag()};
         }
 
-        //! Constructor from std::complex
-        constexpr ALPAKA_FN_HOST_ACC Complex(std::complex<T> const& other) : m_real(other.real()), m_imag(other.imag())
+        //! Addition of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, Complex<T> const& rhs)
         {
+            return Complex<T>{lhs.real() + rhs.real(), lhs.imag() + rhs.imag()};
         }
 
-        //! Conversion to std::complex
-        constexpr ALPAKA_FN_HOST_ACC operator std::complex<T>() const
+        //! Addition of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, T const& rhs)
         {
-            return std::complex<T>{m_real, m_imag};
+            return Complex<T>{lhs.real() + rhs, lhs.imag()};
         }
 
-        //! Assignment
-        Complex& operator=(Complex const&) = default;
-
-        //! Get the real part
-        constexpr ALPAKA_FN_HOST_ACC T real() const
+        //! Addition of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(T const& lhs, Complex<T> const& rhs)
         {
-            return m_real;
+            return Complex<T>{lhs + rhs.real(), rhs.imag()};
         }
 
-        //! Set the real part
-        constexpr ALPAKA_FN_HOST_ACC void real(T value)
+        //! Subtraction of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, Complex<T> const& rhs)
         {
-            m_real = value;
+            return Complex<T>{lhs.real() - rhs.real(), lhs.imag() - rhs.imag()};
         }
 
-        //! Get the imaginary part
-        constexpr ALPAKA_FN_HOST_ACC T imag() const
+        //! Subtraction of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, T const& rhs)
         {
-            return m_imag;
+            return Complex<T>{lhs.real() - rhs, lhs.imag()};
         }
 
-        //! Set the imaginary part
-        constexpr ALPAKA_FN_HOST_ACC void imag(T value)
+        //! Subtraction of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(T const& lhs, Complex<T> const& rhs)
         {
-            m_imag = value;
+            return Complex<T>{lhs - rhs.real(), -rhs.imag()};
         }
 
-        //! Addition assignment with a real number
-        ALPAKA_FN_HOST_ACC Complex& operator+=(T const& other)
+        //! Muptiplication of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, Complex<T> const& rhs)
         {
-            m_real += other;
-            return *this;
+            return Complex<T>{
+                lhs.real() * rhs.real() - lhs.imag() * rhs.imag(),
+                lhs.imag() * rhs.real() + lhs.real() * rhs.imag()};
         }
 
-        //! Addition assignment with a complex number
-        template<typename U>
-        ALPAKA_FN_HOST_ACC Complex& operator+=(Complex<U> const& other)
+        //! Muptiplication of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, T const& rhs)
         {
-            m_real += static_cast<T>(other.real());
-            m_imag += static_cast<T>(other.imag());
-            return *this;
+            return Complex<T>{lhs.real() * rhs, lhs.imag() * rhs};
         }
 
-        //! Subtraction assignment with a real number
-        ALPAKA_FN_HOST_ACC Complex& operator-=(T const& other)
+        //! Muptiplication of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(T const& lhs, Complex<T> const& rhs)
         {
-            m_real -= other;
-            return *this;
+            return Complex<T>{lhs * rhs.real(), lhs * rhs.imag()};
         }
 
-        //! Subtraction assignment with a complex number
-        template<typename U>
-        ALPAKA_FN_HOST_ACC Complex& operator-=(Complex<U> const& other)
+        //! Division of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, Complex<T> const& rhs)
         {
-            m_real -= static_cast<T>(other.real());
-            m_imag -= static_cast<T>(other.imag());
-            return *this;
+            return Complex<T>{
+                (lhs.real() * rhs.real() + lhs.imag() * rhs.imag())
+                    / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
+                (lhs.imag() * rhs.real() - lhs.real() * rhs.imag())
+                    / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
         }
 
-        //! Multiplication assignment with a real number
-        ALPAKA_FN_HOST_ACC Complex& operator*=(T const& other)
+        //! Division of complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, T const& rhs)
         {
-            m_real *= other;
-            m_imag *= other;
-            return *this;
+            return Complex<T>{lhs.real() / rhs, lhs.imag() / rhs};
         }
 
-        //! Multiplication assignment with a complex number
-        template<typename U>
-        ALPAKA_FN_HOST_ACC Complex& operator*=(Complex<U> const& other)
+        //! Division of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(T const& lhs, Complex<T> const& rhs)
         {
-            auto const newReal = m_real * static_cast<T>(other.real()) - m_imag * static_cast<T>(other.imag());
-            auto const newImag = m_imag * static_cast<T>(other.real()) + m_real * static_cast<T>(other.imag());
-            m_real = newReal;
-            m_imag = newImag;
-            return *this;
+            return Complex<T>{
+                lhs * rhs.real() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
+                -lhs * rhs.imag() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
         }
 
-        //! Division assignment with a real number
-        ALPAKA_FN_HOST_ACC Complex& operator/=(T const& other)
+        //! Equality of two complex numbers
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, Complex<T> const& rhs)
         {
-            m_real /= other;
-            m_imag /= other;
-            return *this;
+            return math::floatEqualExactNoWarning(lhs.real(), rhs.real())
+                   && math::floatEqualExactNoWarning(lhs.imag(), rhs.imag());
         }
 
-        //! Division assignment with a complex number
-        template<typename U>
-        ALPAKA_FN_HOST_ACC Complex& operator/=(Complex<U> const& other)
+        //! Equality of a complex and a real number
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, T const& rhs)
         {
-            return *this *= Complex{
-                       static_cast<T>(other.real() / (other.real() * other.real() + other.imag() * other.imag())),
-                       static_cast<T>(-other.imag() / (other.real() * other.real() + other.imag() * other.imag()))};
+            return math::floatEqualExactNoWarning(lhs.real(), rhs)
+                   && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
         }
 
-    private:
-        //! Real and imaginary parts, storage enables array-oriented access
-        T m_real, m_imag;
-    };
-
-    //! Host-device arithmetic operations matching std::complex<T>.
-    //!
-    //! They take and return alpaka::Complex.
-    //!
-    //! @{
-    //!
-
-    //! Unary plus (added for compatibility with std::complex)
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& val)
-    {
-        return val;
-    }
-
-    //! Unary minus
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& val)
-    {
-        return Complex<T>{-val.real(), -val.imag()};
-    }
-
-    //! Addition of two complex numbers
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{lhs.real() + rhs.real(), lhs.imag() + rhs.imag()};
-    }
-
-    //! Addition of a complex and a real number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, T const& rhs)
-    {
-        return Complex<T>{lhs.real() + rhs, lhs.imag()};
-    }
-
-    //! Addition of a real and a complex number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator+(T const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{lhs + rhs.real(), rhs.imag()};
-    }
-
-    //! Subtraction of two complex numbers
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{lhs.real() - rhs.real(), lhs.imag() - rhs.imag()};
-    }
-
-    //! Subtraction of a complex and a real number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, T const& rhs)
-    {
-        return Complex<T>{lhs.real() - rhs, lhs.imag()};
-    }
-
-    //! Subtraction of a real and a complex number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator-(T const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{lhs - rhs.real(), -rhs.imag()};
-    }
-
-    //! Muptiplication of two complex numbers
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{
-            lhs.real() * rhs.real() - lhs.imag() * rhs.imag(),
-            lhs.imag() * rhs.real() + lhs.real() * rhs.imag()};
-    }
-
-    //! Muptiplication of a complex and a real number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, T const& rhs)
-    {
-        return Complex<T>{lhs.real() * rhs, lhs.imag() * rhs};
-    }
-
-    //! Muptiplication of a real and a complex number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator*(T const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{lhs * rhs.real(), lhs * rhs.imag()};
-    }
-
-    //! Division of two complex numbers
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{
-            (lhs.real() * rhs.real() + lhs.imag() * rhs.imag()) / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
-            (lhs.imag() * rhs.real() - lhs.real() * rhs.imag()) / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
-    }
-
-    //! Division of complex and a real number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, T const& rhs)
-    {
-        return Complex<T>{lhs.real() / rhs, lhs.imag() / rhs};
-    }
+        //! Equality of a real and a complex number
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(T const& lhs, Complex<T> const& rhs)
+        {
+            return math::floatEqualExactNoWarning(lhs, rhs.real())
+                   && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
+        }
 
-    //! Division of a real and a complex number
-    template<typename T>
-    ALPAKA_FN_HOST_ACC Complex<T> operator/(T const& lhs, Complex<T> const& rhs)
-    {
-        return Complex<T>{
-            lhs * rhs.real() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
-            -lhs * rhs.imag() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
-    }
-
-    //! Equality of two complex numbers
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return math::floatEqualExactNoWarning(lhs.real(), rhs.real())
-               && math::floatEqualExactNoWarning(lhs.imag(), rhs.imag());
-    }
+        //! Inequality of two complex numbers.
+        //!
+        //! @note this and other versions of operator != should be removed since C++20, as so does std::complex
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            return !(lhs == rhs);
+        }
 
-    //! Equality of a complex and a real number
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, T const& rhs)
-    {
-        return math::floatEqualExactNoWarning(lhs.real(), rhs)
-               && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
-    }
+        //! Inequality of a complex and a real number
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, T const& rhs)
+        {
+            return !math::floatEqualExactNoWarning(lhs.real(), rhs)
+                   || !math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
+        }
 
-    //! Equality of a real and a complex number
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator==(T const& lhs, Complex<T> const& rhs)
-    {
-        return math::floatEqualExactNoWarning(lhs, rhs.real())
-               && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
-    }
-
-    //! Inequality of two complex numbers.
-    //!
-    //! @note this and other versions of operator != should be removed since C++20, as so does std::complex
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, Complex<T> const& rhs)
-    {
-        return !(lhs == rhs);
-    }
+        //! Inequality of a real and a complex number
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(T const& lhs, Complex<T> const& rhs)
+        {
+            return !math::floatEqualExactNoWarning(lhs, rhs.real())
+                   || !math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
+        }
 
-    //! Inequality of a complex and a real number
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, T const& rhs)
-    {
-        return !math::floatEqualExactNoWarning(lhs.real(), rhs)
-               || !math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
-    }
+        //! @}
 
-    //! Inequality of a real and a complex number
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC bool operator!=(T const& lhs, Complex<T> const& rhs)
-    {
-        return !math::floatEqualExactNoWarning(lhs, rhs.real())
-               || !math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
-    }
+        //! Host-only output of a complex number
+        template<typename T, typename TChar, typename TTraits>
+        std::basic_ostream<TChar, TTraits>& operator<<(std::basic_ostream<TChar, TTraits>& os, Complex<T> const& x)
+        {
+            os << x.operator std::complex<T>();
+            return os;
+        }
 
-    //! @}
+        //! Host-only input of a complex number
+        template<typename T, typename TChar, typename TTraits>
+        std::basic_istream<TChar, TTraits>& operator>>(std::basic_istream<TChar, TTraits>& is, Complex<T> const& x)
+        {
+            std::complex<T> z;
+            is >> z;
+            x = z;
+            return is;
+        }
 
-    //! Host-only output of a complex number
-    template<typename T, typename TChar, typename TTraits>
-    std::basic_ostream<TChar, TTraits>& operator<<(std::basic_ostream<TChar, TTraits>& os, Complex<T> const& x)
-    {
-        os << x.operator std::complex<T>();
-        return os;
-    }
+        //! Host-only math functions matching std::complex<T>.
+        //!
+        //! Due to issue #1688, these functions are technically marked host-device and suppress related warnings.
+        //! However, they must be called for host only.
+        //!
+        //! They take and return alpaka::Complex (or a real number when appropriate).
+        //! Internally cast, fall back to std::complex implementation and cast back.
+        //! These functions can be used directly on the host side.
+        //! They are also picked up by ADL in math traits for CPU backends.
+        //!
+        //! On the device side, alpaka math traits must be used instead.
+        //! Note that the set of the traits is currently a bit smaller.
+        //!
+        //! @{
+        //!
+
+        //! Absolute value
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T abs(Complex<T> const& x)
+        {
+            return std::abs(std::complex<T>(x));
+        }
 
-    //! Host-only input of a complex number
-    template<typename T, typename TChar, typename TTraits>
-    std::basic_istream<TChar, TTraits>& operator>>(std::basic_istream<TChar, TTraits>& is, Complex<T> const& x)
-    {
-        std::complex<T> z;
-        is >> z;
-        x = z;
-        return is;
-    }
-
-    //! Host-only math functions matching std::complex<T>.
-    //!
-    //! Due to issue #1688, these functions are technically marked host-device and suppress related warnings.
-    //! However, they must be called for host only.
-    //!
-    //! They take and return alpaka::Complex (or a real number when appropriate).
-    //! Internally cast, fall back to std::complex implementation and cast back.
-    //! These functions can be used directly on the host side.
-    //! They are also picked up by ADL in math traits for CPU backends.
-    //!
-    //! On the device side, alpaka math traits must be used instead.
-    //! Note that the set of the traits is currently a bit smaller.
-    //!
-    //! @{
-    //!
-
-    //! Absolute value
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC T abs(Complex<T> const& x)
-    {
-        return std::abs(std::complex<T>(x));
-    }
+        //! Arc cosine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> acos(Complex<T> const& x)
+        {
+            return std::acos(std::complex<T>(x));
+        }
 
-    //! Arc cosine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> acos(Complex<T> const& x)
-    {
-        return std::acos(std::complex<T>(x));
-    }
+        //! Arc hyperbolic cosine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> acosh(Complex<T> const& x)
+        {
+            return std::acosh(std::complex<T>(x));
+        }
 
-    //! Arc hyperbolic cosine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> acosh(Complex<T> const& x)
-    {
-        return std::acosh(std::complex<T>(x));
-    }
+        //! Argument
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T arg(Complex<T> const& x)
+        {
+            return std::arg(std::complex<T>(x));
+        }
 
-    //! Argument
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC T arg(Complex<T> const& x)
-    {
-        return std::arg(std::complex<T>(x));
-    }
+        //! Arc sine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> asin(Complex<T> const& x)
+        {
+            return std::asin(std::complex<T>(x));
+        }
 
-    //! Arc sine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> asin(Complex<T> const& x)
-    {
-        return std::asin(std::complex<T>(x));
-    }
+        //! Arc hyperbolic sine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> asinh(Complex<T> const& x)
+        {
+            return std::asinh(std::complex<T>(x));
+        }
 
-    //! Arc hyperbolic sine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> asinh(Complex<T> const& x)
-    {
-        return std::asinh(std::complex<T>(x));
-    }
+        //! Arc tangent
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> atan(Complex<T> const& x)
+        {
+            return std::atan(std::complex<T>(x));
+        }
 
-    //! Arc tangent
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> atan(Complex<T> const& x)
-    {
-        return std::atan(std::complex<T>(x));
-    }
+        //! Arc hyperbolic tangent
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> atanh(Complex<T> const& x)
+        {
+            return std::atanh(std::complex<T>(x));
+        }
 
-    //! Arc hyperbolic tangent
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> atanh(Complex<T> const& x)
-    {
-        return std::atanh(std::complex<T>(x));
-    }
+        //! Complex conjugate
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> conj(Complex<T> const& x)
+        {
+            return std::conj(std::complex<T>(x));
+        }
 
-    //! Complex conjugate
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> conj(Complex<T> const& x)
-    {
-        return std::conj(std::complex<T>(x));
-    }
+        //! Cosine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> cos(Complex<T> const& x)
+        {
+            return std::cos(std::complex<T>(x));
+        }
 
-    //! Cosine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> cos(Complex<T> const& x)
-    {
-        return std::cos(std::complex<T>(x));
-    }
+        //! Hyperbolic cosine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> cosh(Complex<T> const& x)
+        {
+            return std::cosh(std::complex<T>(x));
+        }
 
-    //! Hyperbolic cosine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> cosh(Complex<T> const& x)
-    {
-        return std::cosh(std::complex<T>(x));
-    }
+        //! Exponential
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> exp(Complex<T> const& x)
+        {
+            return std::exp(std::complex<T>(x));
+        }
 
-    //! Exponential
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> exp(Complex<T> const& x)
-    {
-        return std::exp(std::complex<T>(x));
-    }
+        //! Natural logarithm
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> log(Complex<T> const& x)
+        {
+            return std::log(std::complex<T>(x));
+        }
 
-    //! Natural logarithm
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> log(Complex<T> const& x)
-    {
-        return std::log(std::complex<T>(x));
-    }
+        //! Base 10 logarithm
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> log10(Complex<T> const& x)
+        {
+            return std::log10(std::complex<T>(x));
+        }
 
-    //! Base 10 logarithm
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> log10(Complex<T> const& x)
-    {
-        return std::log10(std::complex<T>(x));
-    }
+        //! Squared magnitude
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T norm(Complex<T> const& x)
+        {
+            return std::norm(std::complex<T>(x));
+        }
 
-    //! Squared magnitude
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC T norm(Complex<T> const& x)
-    {
-        return std::norm(std::complex<T>(x));
-    }
+        //! Get a complex number with given magnitude and phase angle
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> polar(T const& r, T const& theta = T())
+        {
+            return std::polar(r, theta);
+        }
 
-    //! Get a complex number with given magnitude and phase angle
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> polar(T const& r, T const& theta = T())
-    {
-        return std::polar(r, theta);
-    }
+        //! Complex power of a complex number
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, Complex<U> const& y)
+        {
+            // Use same type promotion as std::pow
+            auto const result = std::pow(std::complex<T>(x), std::complex<U>(y));
+            using ValueType = typename decltype(result)::value_type;
+            return Complex<ValueType>(result);
+        }
 
-    //! Complex power of a complex number
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename U>
-    constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, Complex<U> const& y)
-    {
-        // Use same type promotion as std::pow
-        auto const result = std::pow(std::complex<T>(x), std::complex<U>(y));
-        using ValueType = typename decltype(result)::value_type;
-        return Complex<ValueType>(result);
-    }
-
-    //! Real power of a complex number
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename U>
-    constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, U const& y)
-    {
-        return pow(x, Complex<U>(y));
-    }
+        //! Real power of a complex number
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, U const& y)
+        {
+            return pow(x, Complex<U>(y));
+        }
 
-    //! Complex power of a real number
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename U>
-    constexpr ALPAKA_FN_HOST_ACC auto pow(T const& x, Complex<U> const& y)
-    {
-        return pow(Complex<T>(x), y);
-    }
+        //! Complex power of a real number
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(T const& x, Complex<U> const& y)
+        {
+            return pow(Complex<T>(x), y);
+        }
 
-    //! Projection onto the Riemann sphere
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> proj(Complex<T> const& x)
-    {
-        return std::proj(std::complex<T>(x));
-    }
+        //! Projection onto the Riemann sphere
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> proj(Complex<T> const& x)
+        {
+            return std::proj(std::complex<T>(x));
+        }
 
-    //! Sine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> sin(Complex<T> const& x)
-    {
-        return std::sin(std::complex<T>(x));
-    }
+        //! Sine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sin(Complex<T> const& x)
+        {
+            return std::sin(std::complex<T>(x));
+        }
 
-    //! Hyperbolic sine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> sinh(Complex<T> const& x)
-    {
-        return std::sinh(std::complex<T>(x));
-    }
+        //! Hyperbolic sine
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sinh(Complex<T> const& x)
+        {
+            return std::sinh(std::complex<T>(x));
+        }
 
-    //! Square root
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> sqrt(Complex<T> const& x)
-    {
-        return std::sqrt(std::complex<T>(x));
-    }
+        //! Square root
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sqrt(Complex<T> const& x)
+        {
+            return std::sqrt(std::complex<T>(x));
+        }
 
-    //! Tangent
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> tan(Complex<T> const& x)
-    {
-        return std::tan(std::complex<T>(x));
-    }
+        //! Tangent
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> tan(Complex<T> const& x)
+        {
+            return std::tan(std::complex<T>(x));
+        }
 
-    //! Hyperbolic tangent
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    constexpr ALPAKA_FN_HOST_ACC Complex<T> tanh(Complex<T> const& x)
-    {
-        return std::tanh(std::complex<T>(x));
-    }
+        //! Hyperbolic tangent
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> tanh(Complex<T> const& x)
+        {
+            return std::tanh(std::complex<T>(x));
+        }
 
-    //! @}
+        //! @}
+    } // namespace internal
 
+    using internal::Complex;
 } // namespace alpaka
diff --git a/alpaka/include/alpaka/mem/buf/BufCpuSycl.hpp b/alpaka/include/alpaka/mem/buf/BufCpuSycl.hpp
index d63eebf5..ab36f8be 100644
--- a/alpaka/include/alpaka/mem/buf/BufCpuSycl.hpp
+++ b/alpaka/include/alpaka/mem/buf/BufCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,6 +6,7 @@
 
 #include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformCpuSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
diff --git a/alpaka/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp b/alpaka/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
index 2dca26f1..562fae94 100644
--- a/alpaka/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,13 +6,14 @@
 
 #include "alpaka/dev/DevFpgaSyclIntel.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufFpgaSyclIntel = BufGenericSycl<TElem, TDim, TIdx, DevFpgaSyclIntel>;
+    using BufFpgaSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PlatformFpgaSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/mem/buf/BufGenericSycl.hpp b/alpaka/include/alpaka/mem/buf/BufGenericSycl.hpp
index b4a5fd94..9beb16c7 100644
--- a/alpaka/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/alpaka/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -24,8 +24,8 @@
 namespace alpaka
 {
     //! The SYCL memory buffer.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
     public:
         static_assert(
@@ -36,7 +36,7 @@ namespace alpaka
 
         //! Constructor
         template<typename TExtent, typename Deleter>
-        BufGenericSycl(DevGenericSycl<TPlatform> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
+        BufGenericSycl(DevGenericSycl<TTag> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
             : m_dev{dev}
             , m_extentElements{getExtentVecEnd<TDim>(extent)}
             , m_spMem(pMem, std::move(deleter))
@@ -53,7 +53,7 @@ namespace alpaka
                 "The idx type of TExtent and the TIdx template parameter have to be identical!");
         }
 
-        DevGenericSycl<TPlatform> m_dev;
+        DevGenericSycl<TTag> m_dev;
         Vec<TDim, TIdx> m_extentElements;
         std::shared_ptr<TElem> m_spMem;
     };
@@ -62,68 +62,67 @@ namespace alpaka
 namespace alpaka::trait
 {
     //! The BufGenericSycl device type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
-        using type = DevGenericSycl<TPlatform>;
+        using type = DevGenericSycl<TTag>;
     };
 
     //! The BufGenericSycl device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
-        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TPlatform> const& buf)
+        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf)
         {
             return buf.m_dev;
         }
     };
 
     //! The BufGenericSycl dimension getter trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
         using type = TDim;
     };
 
     //! The BufGenericSycl memory element type get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
         using type = TElem;
     };
 
     //! The BufGenericSycl extent get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetExtents<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetExtents<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
-        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TPlatform> const& buf) const
+        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) const
         {
             return buf.m_extentElements;
         }
     };
 
     //! The BufGenericSycl native pointer get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPlatform> const& buf) -> TElem const*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) -> TElem const*
         {
             return buf.m_spMem.get();
         }
 
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPlatform>& buf) -> TElem*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf) -> TElem*
         {
             return buf.m_spMem.get();
         }
     };
 
     //! The BufGenericSycl pointer on device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TPlatform>, DevGenericSycl<TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TTag>, DevGenericSycl<TTag>>
     {
-        static auto getPtrDev(
-            BufGenericSycl<TElem, TDim, TIdx, TPlatform> const& buf,
-            DevGenericSycl<TPlatform> const& dev) -> TElem const*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf, DevGenericSycl<TTag> const& dev)
+            -> TElem const*
         {
             if(dev == getDev(buf))
             {
@@ -135,8 +134,7 @@ namespace alpaka::trait
             }
         }
 
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TPlatform>& buf, DevGenericSycl<TPlatform> const& dev)
-            -> TElem*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf, DevGenericSycl<TTag> const& dev) -> TElem*
         {
             if(dev == getDev(buf))
             {
@@ -150,12 +148,12 @@ namespace alpaka::trait
     };
 
     //! The SYCL memory allocation trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TTag>>
     {
         template<typename TExtent>
-        static auto allocBuf(DevGenericSycl<TPlatform> const& dev, TExtent const& extent)
-            -> BufGenericSycl<TElem, TDim, TIdx, TPlatform>
+        static auto allocBuf(DevGenericSycl<TTag> const& dev, TExtent const& extent)
+            -> BufGenericSycl<TElem, TDim, TIdx, TTag>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -197,38 +195,40 @@ namespace alpaka::trait
                 nativeContext);
             auto deleter = [ctx = nativeContext](TElem* ptr) { sycl::free(ptr, ctx); };
 
-            return BufGenericSycl<TElem, TDim, TIdx, TPlatform>(dev, memPtr, std::move(deleter), extent);
+            return BufGenericSycl<TElem, TDim, TIdx, TTag>(dev, memPtr, std::move(deleter), extent);
         }
     };
 
     //! The BufGenericSycl stream-ordered memory allocation capability trait specialization.
-    template<typename TDim, typename TPlatform>
-    struct HasAsyncBufSupport<TDim, DevGenericSycl<TPlatform>> : std::false_type
+    template<typename TDim, typename TTag>
+    struct HasAsyncBufSupport<TDim, DevGenericSycl<TTag>> : std::false_type
     {
     };
 
     //! The BufGenericSycl offset get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetOffsets<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetOffsets<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
-        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TPlatform> const&) const -> Vec<TDim, TIdx>
+        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const&) const -> Vec<TDim, TIdx>
         {
             return Vec<TDim, TIdx>::zeros();
         }
     };
 
     //! The pinned/mapped memory allocation trait specialization for the SYCL devices.
-    template<typename TPlatform, typename TElem, typename TDim, typename TIdx>
-    struct BufAllocMapped
+    template<typename TTag, typename TElem, typename TDim, typename TIdx>
+    struct BufAllocMapped<PlatformGenericSycl<TTag>, TElem, TDim, TIdx>
     {
         template<typename TExtent>
-        static auto allocMappedBuf(DevCpu const& host, TPlatform const& platform, TExtent const& extent)
-            -> BufCpu<TElem, TDim, TIdx>
+        static auto allocMappedBuf(
+            DevCpu const& host,
+            PlatformGenericSycl<TTag> const& platform,
+            TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-            // Allocate SYCL page-locked memory on the host, mapped into the TPlatform address space and
-            // accessible to all devices in the TPlatform.
+            // Allocate SYCL page-locked memory on the host, mapped into the SYCL platform's address space and
+            // accessible to all devices in the SYCL platform.
             auto ctx = platform.syclContext();
             TElem* memPtr = sycl::malloc_host<TElem>(static_cast<std::size_t>(getExtentProduct(extent)), ctx);
             auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); };
@@ -237,23 +237,29 @@ namespace alpaka::trait
         }
     };
 
+    //! The pinned/mapped memory allocation capability trait specialization.
+    template<typename TTag>
+    struct HasMappedBufSupport<PlatformGenericSycl<TTag>> : public std::true_type
+    {
+    };
+
     //! The BufGenericSycl idx type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
     {
         using type = TIdx;
     };
 
     //! The BufCpu pointer on SYCL device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TPlatform>
-    struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TPlatform>>
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TTag>>
     {
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TPlatform> const&) -> TElem const*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TTag> const&) -> TElem const*
         {
             return getPtrNative(buf);
         }
 
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TPlatform> const&) -> TElem*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TTag> const&) -> TElem*
         {
             return getPtrNative(buf);
         }
diff --git a/alpaka/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/alpaka/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
index dd20f8a3..5597f701 100644
--- a/alpaka/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,6 +6,7 @@
 
 #include "alpaka/dev/DevGpuSyclIntel.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
diff --git a/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
index 64981209..826edaba 100644
--- a/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
+++ b/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
@@ -78,7 +78,7 @@ namespace alpaka
                 "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
                 "identical!");
             static_assert(
-                std::is_same_v<TIdx, Idx<TExtent>>,
+                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
                 "The idx type of TExtent and the TIdx template parameter have to be identical!");
         }
 
diff --git a/alpaka/include/alpaka/mem/buf/Traits.hpp b/alpaka/include/alpaka/mem/buf/Traits.hpp
index 33e7c9bd..e29cf5bf 100644
--- a/alpaka/include/alpaka/mem/buf/Traits.hpp
+++ b/alpaka/include/alpaka/mem/buf/Traits.hpp
@@ -127,14 +127,14 @@ namespace alpaka
 
     //! Allocates pinned/mapped host memory, accessible by all devices in the given platform.
     //!
-    //! \tparam TPlatform The platform from which the buffer is accessible.
     //! \tparam TElem The element type of the returned buffer.
     //! \tparam TIdx The linear index type of the buffer.
     //! \tparam TExtent The extent type of the buffer.
+    //! \tparam TPlatform The platform from which the buffer is accessible.
     //! \param host The host device to allocate the buffer on.
     //! \param extent The extent of the buffer.
     //! \return The newly allocated buffer.
-    template<typename TPlatform, typename TElem, typename TIdx, typename TExtent>
+    template<typename TElem, typename TIdx, typename TExtent, typename TPlatform>
     ALPAKA_FN_HOST auto allocMappedBuf(
         DevCpu const& host,
         TPlatform const& platform,
@@ -180,7 +180,7 @@ namespace alpaka
         using Platform = alpaka::Platform<TPlatform>;
         if constexpr(hasMappedBufSupport<Platform>)
         {
-            return allocMappedBuf<Platform, TElem, TIdx>(host, platform, extent);
+            return allocMappedBuf<TElem, TIdx>(host, platform, extent);
         }
         else
         {
diff --git a/alpaka/include/alpaka/mem/buf/sycl/Copy.hpp b/alpaka/include/alpaka/mem/buf/sycl/Copy.hpp
index 806c728a..44098f1b 100644
--- a/alpaka/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/alpaka/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -195,8 +195,8 @@ namespace alpaka::detail
 namespace alpaka::trait
 {
     //! The SYCL host-to-device memory copy trait specialization.
-    template<typename TPlatform, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevGenericSycl<TPlatform>, DevCpu>
+    template<typename TTag, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTag>, DevCpu>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
         static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
@@ -209,8 +209,8 @@ namespace alpaka::trait
     };
 
     //! The SYCL device-to-host memory copy trait specialization.
-    template<typename TPlatform, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TPlatform>>
+    template<typename TTag, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TTag>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
         static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
@@ -223,8 +223,8 @@ namespace alpaka::trait
     };
 
     //! The SYCL device-to-device memory copy trait specialization.
-    template<typename TPlatformDst, typename TPlatformSrc, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevGenericSycl<TPlatformDst>, DevGenericSycl<TPlatformSrc>>
+    template<typename TTagDst, typename TTagSrc, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTagDst>, DevGenericSycl<TTagSrc>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
         static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
diff --git a/alpaka/include/alpaka/mem/buf/sycl/Set.hpp b/alpaka/include/alpaka/mem/buf/sycl/Set.hpp
index 17187fd0..73478d34 100644
--- a/alpaka/include/alpaka/mem/buf/sycl/Set.hpp
+++ b/alpaka/include/alpaka/mem/buf/sycl/Set.hpp
@@ -200,9 +200,9 @@ namespace alpaka
         {
             template<typename TExtent, typename TView>
             static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
-                -> detail::TaskSetSycl<TDim, TView, TExtent>
+                -> alpaka::detail::TaskSetSycl<TDim, TView, TExtent>
             {
-                return detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
+                return alpaka::detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
             }
         };
 
diff --git a/alpaka/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/alpaka/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
index 2d6ae066..2c2cd9ef 100644
--- a/alpaka/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
+++ b/alpaka/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
@@ -51,7 +51,7 @@ namespace alpaka::trait
     {
         static auto mem_fence(MemFenceGenericSycl const&, TMemScope const&)
         {
-            static constexpr auto scope = detail::SyclFenceProps<TMemScope>::scope;
+            static constexpr auto scope = alpaka::detail::SyclFenceProps<TMemScope>::scope;
             sycl::atomic_fence(sycl::memory_order::acq_rel, scope);
         }
     };
diff --git a/alpaka/include/alpaka/mem/global/DeviceGlobalCpu.hpp b/alpaka/include/alpaka/mem/global/DeviceGlobalCpu.hpp
new file mode 100644
index 00000000..aafcb06f
--- /dev/null
+++ b/alpaka/include/alpaka/mem/global/DeviceGlobalCpu.hpp
@@ -0,0 +1,151 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/mem/buf/cpu/Copy.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+
+#include <type_traits>
+
+// memcpy specialization for device global variables
+namespace alpaka
+{
+
+    namespace detail
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagCpuOmp2Blocks, T>
+        {
+            using Type = detail::DevGlobalImplGeneric<TagCpuOmp2Blocks, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuOmp2Threads, T>
+        {
+            using Type = detail::DevGlobalImplGeneric<TagCpuOmp2Threads, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuSerial, T>
+        {
+            using Type = detail::DevGlobalImplGeneric<TagCpuSerial, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuTbbBlocks, T>
+        {
+            using Type = detail::DevGlobalImplGeneric<TagCpuTbbBlocks, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuThreads, T>
+        {
+            using Type = detail::DevGlobalImplGeneric<TagCpuThreads, T>;
+        };
+    } // namespace detail
+
+    template<
+        typename TTag,
+        typename TViewSrc,
+        typename TTypeDst,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc) -> void
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        auto extent = getExtents(viewSrc);
+        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<decltype(extent)>, alpaka::Idx<decltype(extent)>>(
+            reinterpret_cast<Type*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst)),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
+    }
+
+    template<
+        typename TTag,
+        typename TTypeSrc,
+        typename TViewDstFwd,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        TViewDstFwd&& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc) -> void
+    {
+        using Type = std::remove_all_extents_t<TTypeSrc>;
+        auto extent = getExtents(viewDst);
+        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<decltype(extent)>, alpaka::Idx<decltype(extent)>>(
+            reinterpret_cast<Type*>(&viewSrc),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), view, extent));
+    }
+
+    template<
+        typename TTag,
+        typename TExtent,
+        typename TViewSrc,
+        typename TTypeDst,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent const& extent) -> void
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+            reinterpret_cast<Type*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst)),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
+    }
+
+    template<
+        typename TTag,
+        typename TExtent,
+        typename TTypeSrc,
+        typename TViewDstFwd,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        TViewDstFwd&& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc,
+        TExtent const& extent) -> void
+    {
+        using Type = std::remove_all_extents_t<TTypeSrc>;
+        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+            reinterpret_cast<Type*>(&viewSrc),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), view, extent));
+    }
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp b/alpaka/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
new file mode 100644
index 00000000..56ee98c8
--- /dev/null
+++ b/alpaka/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
@@ -0,0 +1,96 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagCpuSycl, T>
+        {
+            // SYCL CPU implementation
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGpuSyclIntel, T>
+        {
+            // SYCL GPU implementation
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagFpgaSyclIntel, T>
+        {
+            // SYCL FPGA implementation
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGenericSycl, T>
+        {
+            // generic SYCL implementation
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+    } // namespace detail
+
+    // from device to host
+    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        TViewDst&& viewDst,
+        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc)
+    {
+        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc);
+    }
+
+    // from host to device
+    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
+        TViewSrc const& viewSrc)
+    {
+        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)));
+    }
+
+    // from device to host
+    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc, typename TExtent>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        TViewDst&& viewDst,
+        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc,
+        TExtent extent)
+    {
+        using Elem = alpaka::Elem<std::remove_reference_t<TViewDst>>;
+        auto size = static_cast<std::size_t>(getHeight(extent)) * static_cast<std::size_t>(getDepth(extent))
+                    * static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem);
+        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc, size);
+    }
+
+    // from host to device
+    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc, typename TExtent>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent extent)
+    {
+        using Elem = alpaka::Elem<TViewSrc>;
+        auto size = static_cast<std::size_t>(getHeight(extent)) * static_cast<std::size_t>(getDepth(extent))
+                    * static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem);
+        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)), size);
+    }
+} // namespace alpaka
+#endif
diff --git a/alpaka/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp b/alpaka/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
new file mode 100644
index 00000000..6b802fc9
--- /dev/null
+++ b/alpaka/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,187 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include "alpaka/core/ApiCudaRt.hpp"
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#        include "alpaka/core/ApiHipRt.hpp"
+#    endif
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagGpuCudaRt, T>
+        {
+            // CUDA implementation
+            using Type = detail::DevGlobalImplGeneric<TagGpuCudaRt, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGpuHipRt, T>
+        {
+            // HIP/ROCm implementation
+            using Type = detail::DevGlobalImplGeneric<TagGpuHipRt, T>;
+        };
+    } // namespace detail
+
+    // from device to host
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TViewDst,
+        typename TTypeSrc,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        TViewDst& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc)
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
+        using TypeExt = std::remove_const_t<TTypeSrc>;
+        auto extent = getExtents(viewDst);
+        TypeExt* pMemAcc(nullptr);
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
+
+        auto view = alpaka::ViewPlainPtr<
+            DevUniformCudaHipRt<TApi>,
+            Type,
+            alpaka::Dim<decltype(extent)>,
+            alpaka::Idx<decltype(extent)>>(reinterpret_cast<Type*>(pMemAcc), alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDst>(viewDst), view, extent));
+    }
+
+    // from host to device
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TTypeDst,
+        typename TViewSrc,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc)
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        using TypeExt = std::remove_const_t<TTypeDst>;
+        auto extent = getExtents(viewSrc);
+        Type* pMemAcc(nullptr);
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewDst))));
+
+        auto view = alpaka::ViewPlainPtr<
+            DevUniformCudaHipRt<TApi>,
+            Type,
+            alpaka::Dim<decltype(extent)>,
+            alpaka::Idx<decltype(extent)>>(reinterpret_cast<Type*>(pMemAcc), alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
+    }
+
+    // from device to host
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TViewDst,
+        typename TTypeSrc,
+        typename TExtent,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        TViewDst& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc,
+        TExtent extent)
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
+        using TypeExt = std::remove_const_t<TTypeSrc>;
+        Type* pMemAcc(nullptr);
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
+
+        auto view = alpaka::ViewPlainPtr<DevUniformCudaHipRt<TApi>, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+            reinterpret_cast<Type*>(pMemAcc),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDst>(viewDst), view, extent));
+    }
+
+    // from host to device
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TTypeDst,
+        typename TViewSrc,
+        typename TExtent,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent extent)
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        using TypeExt = std::remove_const_t<TTypeDst>;
+        Type* pMemAcc(nullptr);
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewDst))));
+
+        auto view = alpaka::ViewPlainPtr<DevUniformCudaHipRt<TApi>, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+            reinterpret_cast<Type*>(pMemAcc),
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
+    }
+} // namespace alpaka
+
+#endif
diff --git a/alpaka/include/alpaka/mem/global/Traits.hpp b/alpaka/include/alpaka/mem/global/Traits.hpp
new file mode 100644
index 00000000..7b3c3d1c
--- /dev/null
+++ b/alpaka/include/alpaka/mem/global/Traits.hpp
@@ -0,0 +1,45 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+        template<typename TTag, typename T>
+        struct DevGlobalImplGeneric
+        {
+            // does not make use of TTag
+            using Type = std::remove_const_t<T>;
+            Type value; // backend specific value
+
+            ALPAKA_FN_HOST_ACC T* operator&()
+            {
+                return &value;
+            }
+
+            ALPAKA_FN_HOST_ACC T& get()
+            {
+                return value;
+            }
+        };
+
+        template<typename TTag, typename T>
+        struct DevGlobalTrait
+        {
+            static constexpr bool const IsImplementedFor = alpaka::meta::DependentFalseType<TTag>::value;
+
+            static_assert(IsImplementedFor, "Error: device global variables are not implemented for the given Tag");
+        };
+    } // namespace detail
+
+    template<typename TAcc, typename T>
+    using DevGlobal = typename detail::DevGlobalTrait<typename alpaka::trait::AccToTag<TAcc>::type, T>::Type;
+} // namespace alpaka
diff --git a/alpaka/include/alpaka/mem/view/Traits.hpp b/alpaka/include/alpaka/mem/view/Traits.hpp
index 8493a2d3..5a9db5b2 100644
--- a/alpaka/include/alpaka/mem/view/Traits.hpp
+++ b/alpaka/include/alpaka/mem/view/Traits.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber,
+ *                Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -118,10 +119,6 @@ namespace alpaka
         template<typename TDim, typename TDevDst, typename TDevSrc, typename TSfinae = void>
         struct CreateTaskMemcpy;
 
-        //! The static device memory view creation trait.
-        template<typename TDev, typename TSfinae = void>
-        struct CreateStaticDevMemView;
-
         //! The device memory view creation trait.
         template<typename TDev, typename TSfinae = void>
         struct CreateViewPlainPtr;
@@ -425,13 +422,6 @@ namespace alpaka
         return subVecEnd<TDim>(getPitchesInBytes(view));
     }
 
-    //! \return A view to static device memory.
-    template<typename TElem, typename TDev, typename TExtent>
-    auto createStaticDevMemView(TElem* pMem, TDev const& dev, TExtent const& extent)
-    {
-        return trait::CreateStaticDevMemView<TDev>::createStaticDevMemView(pMem, dev, extent);
-    }
-
     //! Creates a view to a device pointer
     //!
     //! \param dev Device from where pMem can be accessed.
diff --git a/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp b/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
index 532545c9..dda4a179 100644
--- a/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
  *                Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
@@ -115,54 +115,6 @@ namespace alpaka
             }
         };
 
-        //! The CPU device CreateStaticDevMemView trait specialization.
-        template<>
-        struct CreateStaticDevMemView<DevCpu>
-        {
-            template<typename TElem, typename TExtent>
-            static auto createStaticDevMemView(TElem* pMem, DevCpu const& dev, TExtent const& extent)
-            {
-                return alpaka::ViewPlainPtr<DevCpu, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                    pMem,
-                    dev,
-                    extent);
-            }
-        };
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-        //! The CUDA/HIP RT device CreateStaticDevMemView trait specialization.
-        template<typename TApi>
-        struct CreateStaticDevMemView<DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TElem, typename TExtent>
-            static auto createStaticDevMemView(
-                TElem* pMem,
-                DevUniformCudaHipRt<TApi> const& dev,
-                TExtent const& extent)
-            {
-                TElem* pMemAcc(nullptr);
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *pMem));
-
-                return alpaka::
-                    ViewPlainPtr<DevUniformCudaHipRt<TApi>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                        pMemAcc,
-                        dev,
-                        extent);
-            }
-        };
-#endif
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED)
-        //! The SYCL device CreateStaticDevMemView trait specialization.
-        template<typename TPlatform>
-        struct CreateStaticDevMemView<DevGenericSycl<TPlatform>>
-        {
-            static_assert(
-                meta::DependentFalseType<TPlatform>::value,
-                "The SYCL backend does not support global device variables.");
-        };
-#endif
-
         //! The CPU device CreateViewPlainPtr trait specialization.
         template<>
         struct CreateViewPlainPtr<DevCpu>
@@ -202,22 +154,21 @@ namespace alpaka
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED)
         //! The SYCL device CreateViewPlainPtr trait specialization.
-        template<typename TPlatform>
-        struct CreateViewPlainPtr<DevGenericSycl<TPlatform>>
+        template<typename TTag>
+        struct CreateViewPlainPtr<DevGenericSycl<TTag>>
         {
             template<typename TElem, typename TExtent, typename TPitch>
             static auto createViewPlainPtr(
-                DevGenericSycl<TPlatform> const& dev,
+                DevGenericSycl<TTag> const& dev,
                 TElem* pMem,
                 TExtent const& extent,
                 TPitch pitch)
             {
-                return alpaka::
-                    ViewPlainPtr<DevGenericSycl<TPlatform>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                        pMem,
-                        dev,
-                        extent,
-                        pitch);
+                return alpaka::ViewPlainPtr<DevGenericSycl<TTag>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent,
+                    pitch);
             }
         };
 #endif
diff --git a/alpaka/include/alpaka/mem/view/ViewSubView.hpp b/alpaka/include/alpaka/mem/view/ViewSubView.hpp
index 88857b08..a35fa22b 100644
--- a/alpaka/include/alpaka/mem/view/ViewSubView.hpp
+++ b/alpaka/include/alpaka/mem/view/ViewSubView.hpp
@@ -53,23 +53,23 @@ namespace alpaka
                 "The dev type of TView and the Dev template parameter have to be identical!");
 
             static_assert(
-                std::is_same_v<TIdx, Idx<View>>,
+                std::is_same_v<TIdx, alpaka::Idx<View>>,
                 "The idx type of TView and the TIdx template parameter have to be identical!");
             static_assert(
-                std::is_same_v<TIdx, Idx<TExtent>>,
+                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
                 "The idx type of TExtent and the TIdx template parameter have to be identical!");
             static_assert(
-                std::is_same_v<TIdx, Idx<TOffsets>>,
+                std::is_same_v<TIdx, alpaka::Idx<TOffsets>>,
                 "The idx type of TOffsets and the TIdx template parameter have to be identical!");
 
             static_assert(
-                std::is_same_v<TDim, Dim<View>>,
+                std::is_same_v<TDim, alpaka::Dim<View>>,
                 "The dim type of TView and the TDim template parameter have to be identical!");
             static_assert(
-                std::is_same_v<TDim, Dim<TExtent>>,
+                std::is_same_v<TDim, alpaka::Dim<TExtent>>,
                 "The dim type of TExtent and the TDim template parameter have to be identical!");
             static_assert(
-                std::is_same_v<TDim, Dim<TOffsets>>,
+                std::is_same_v<TDim, alpaka::Dim<TOffsets>>,
                 "The dim type of TOffsets and the TDim template parameter have to be identical!");
 
             ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= getExtents(view)).all());
diff --git a/alpaka/include/alpaka/meta/CudaVectorArrayWrapper.hpp b/alpaka/include/alpaka/meta/CudaVectorArrayWrapper.hpp
deleted file mode 100644
index 57010ef2..00000000
--- a/alpaka/include/alpaka/meta/CudaVectorArrayWrapper.hpp
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright 2022 Jiří Vyskočil, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <functional>
-#include <initializer_list>
-#include <numeric>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) || defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-
-#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#        include <cuda_runtime.h>
-#    endif
-
-#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-#        include <hip/hip_runtime.h>
-#    endif
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename TScalar, unsigned N>
-        struct CudaVectorArrayTypeTraits;
-
-        template<>
-        struct CudaVectorArrayTypeTraits<float, 1>
-        {
-            using type = float1;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<float, 2>
-        {
-            using type = float2;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<float, 3>
-        {
-            using type = float3;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<float, 4>
-        {
-            using type = float4;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<double, 1>
-        {
-            using type = double1;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<double, 2>
-        {
-            using type = double2;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<double, 3>
-        {
-            using type = double3;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<double, 4>
-        {
-            using type = double4;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<unsigned, 1>
-        {
-            using type = uint1;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<unsigned, 2>
-        {
-            using type = uint2;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<unsigned, 3>
-        {
-            using type = uint3;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<unsigned, 4>
-        {
-            using type = uint4;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<int, 1>
-        {
-            using type = int1;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<int, 2>
-        {
-            using type = int2;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<int, 3>
-        {
-            using type = int3;
-        };
-
-        template<>
-        struct CudaVectorArrayTypeTraits<int, 4>
-        {
-            using type = int4;
-        };
-    } // namespace detail
-
-    /// Helper struct providing [] subscript access to CUDA vector types
-    template<typename TScalar, unsigned N>
-    struct CudaVectorArrayWrapper;
-
-    template<typename TScalar>
-    struct CudaVectorArrayWrapper<TScalar, 4> : public detail::CudaVectorArrayTypeTraits<TScalar, 4>::type
-    {
-        using value_type = TScalar;
-        static constexpr unsigned size = 4;
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(std::initializer_list<TScalar> init)
-        {
-            auto it = std::begin(init);
-            this->x = *it++;
-            this->y = *it++;
-            this->z = *it++;
-            this->w = *it++;
-        }
-
-        template<class Other>
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(Other const& o)
-        {
-            static_assert(std::tuple_size_v<Other> == size, "Can only convert between vectors of same size.");
-            static_assert(
-                std::is_same_v<typename Other::value_type, value_type>,
-                "Can only convert between vectors of same element type.");
-            this->x = o[0];
-            this->y = o[1];
-            this->z = o[2];
-            this->w = o[3];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr operator std::array<value_type, size>() const
-        {
-            std::array<value_type, size> ret;
-            ret[0] = this->x;
-            ret[1] = this->y;
-            ret[2] = this->z;
-            ret[3] = this->w;
-            return ret;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type& operator[](int const k) noexcept
-        {
-            assert(k >= 0 && k < 4);
-            return k == 0 ? this->x : (k == 1 ? this->y : (k == 2 ? this->z : this->w));
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type const& operator[](int const k) const noexcept
-        {
-            assert(k >= 0 && k < 4);
-            return k == 0 ? this->x : (k == 1 ? this->y : (k == 2 ? this->z : this->w));
-        }
-    };
-
-    template<typename TScalar>
-    struct CudaVectorArrayWrapper<TScalar, 3> : public detail::CudaVectorArrayTypeTraits<TScalar, 3>::type
-    {
-        using value_type = TScalar;
-        static constexpr unsigned size = 3;
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(std::initializer_list<TScalar> init)
-        {
-            auto it = std::begin(init);
-            this->x = *it++;
-            this->y = *it++;
-            this->z = *it++;
-        }
-
-        template<class Other>
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(Other const& o)
-        {
-            static_assert(std::tuple_size<Other>::value == size, "Can only convert between vectors of same size.");
-            static_assert(
-                std::is_same<typename Other::value_type, value_type>::value,
-                "Can only convert between vectors of same element type.");
-            this->x = o[0];
-            this->y = o[1];
-            this->z = o[2];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr operator std::array<value_type, size>() const
-        {
-            std::array<value_type, size> ret;
-            ret[0] = this->x;
-            ret[1] = this->y;
-            ret[2] = this->z;
-            return ret;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type& operator[](int const k) noexcept
-        {
-            assert(k >= 0 && k < 3);
-            return k == 0 ? this->x : (k == 1 ? this->y : this->z);
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type const& operator[](int const k) const noexcept
-        {
-            assert(k >= 0 && k < 3);
-            return k == 0 ? this->x : (k == 1 ? this->y : this->z);
-        }
-    };
-
-    template<typename TScalar>
-    struct CudaVectorArrayWrapper<TScalar, 2> : public detail::CudaVectorArrayTypeTraits<TScalar, 2>::type
-    {
-        using value_type = TScalar;
-        static constexpr unsigned size = 2;
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(std::initializer_list<TScalar> init)
-        {
-            auto it = std::begin(init);
-            this->x = *it++;
-            this->y = *it++;
-        }
-
-        template<class Other>
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(Other const& o)
-        {
-            static_assert(std::tuple_size<Other>::value == size, "Can only convert between vectors of same size.");
-            static_assert(
-                std::is_same<typename Other::value_type, value_type>::value,
-                "Can only convert between vectors of same element type.");
-            this->x = o[0];
-            this->y = o[1];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr operator std::array<value_type, size>() const
-        {
-            std::array<value_type, size> ret;
-            ret[0] = this->x;
-            ret[1] = this->y;
-            return ret;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type& operator[](int const k) noexcept
-        {
-            assert(k >= 0 && k < 2);
-            return k == 0 ? this->x : this->y;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type const& operator[](int const k) const noexcept
-        {
-            assert(k >= 0 && k < 2);
-            return k == 0 ? this->x : this->y;
-        }
-    };
-
-    template<typename TScalar>
-    struct CudaVectorArrayWrapper<TScalar, 1> : public detail::CudaVectorArrayTypeTraits<TScalar, 1>::type
-    {
-        using value_type = TScalar;
-        static constexpr unsigned size = 1;
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(std::initializer_list<TScalar> init)
-        {
-            auto it = std::begin(init);
-            this->x = *it;
-        }
-
-        template<class Other>
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE CudaVectorArrayWrapper(Other const& o)
-        {
-            static_assert(std::tuple_size<Other>::value == size, "Can only convert between vectors of same size.");
-            static_assert(
-                std::is_same<typename Other::value_type, value_type>::value,
-                "Can only convert between vectors of same element type.");
-            this->x = o[0];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr operator std::array<value_type, size>() const
-        {
-            std::array<value_type, size> ret;
-            ret[0] = this->x;
-            return ret;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type& operator[]([[maybe_unused]] int const k) noexcept
-        {
-            assert(k == 0);
-            return this->x;
-        }
-
-        ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE constexpr value_type const& operator[](
-            [[maybe_unused]] int const k) const noexcept
-        {
-            assert(k == 0);
-            return this->x;
-        }
-    };
-} // namespace alpaka::meta
-
-namespace std
-{
-    /// Specialization of std::tuple_size for \a float4_array
-    template<typename T, unsigned N>
-    struct tuple_size<alpaka::meta::CudaVectorArrayWrapper<T, N>> : integral_constant<size_t, N>
-    {
-    };
-} // namespace std
-
-#endif
diff --git a/alpaka/include/alpaka/meta/Filter.hpp b/alpaka/include/alpaka/meta/Filter.hpp
index 4a260777..52e93dc1 100644
--- a/alpaka/include/alpaka/meta/Filter.hpp
+++ b/alpaka/include/alpaka/meta/Filter.hpp
@@ -12,16 +12,16 @@ namespace alpaka::meta
 {
     namespace detail
     {
-        template<template<typename...> class TList, template<typename> class TPred, typename... Ts>
+        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
         struct FilterImplHelper;
 
-        template<template<typename...> class TList, template<typename> class TPred>
+        template<template<typename...> class TList, template<typename...> class TPred>
         struct FilterImplHelper<TList, TPred>
         {
             using type = TList<>;
         };
 
-        template<template<typename...> class TList, template<typename> class TPred, typename T, typename... Ts>
+        template<template<typename...> class TList, template<typename...> class TPred, typename T, typename... Ts>
         struct FilterImplHelper<TList, TPred, T, Ts...>
         {
             using type = std::conditional_t<
@@ -30,15 +30,18 @@ namespace alpaka::meta
                 typename FilterImplHelper<TList, TPred, Ts...>::type>;
         };
 
-        template<typename TList, template<typename> class TPred>
+        template<typename TList, template<typename...> class TPred>
         struct FilterImpl;
 
-        template<template<typename...> class TList, template<typename> class TPred, typename... Ts>
+        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
         struct FilterImpl<TList<Ts...>, TPred>
         {
             using type = typename detail::FilterImplHelper<TList, TPred, Ts...>::type;
         };
     } // namespace detail
-    template<typename TList, template<typename> class TPred>
+
+    /// \tparam TPred Only the first parameter is used, all other must be set by TPred to some default.
+    ///               Using '...' instead of a single type is a workaround for CrayClang.
+    template<typename TList, template<typename...> class TPred>
     using Filter = typename detail::FilterImpl<TList, TPred>::type;
 } // namespace alpaka::meta
diff --git a/alpaka/include/alpaka/meta/IsArrayOrVector.hpp b/alpaka/include/alpaka/meta/IsArrayOrVector.hpp
index ae946362..f755916d 100644
--- a/alpaka/include/alpaka/meta/IsArrayOrVector.hpp
+++ b/alpaka/include/alpaka/meta/IsArrayOrVector.hpp
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include "alpaka/meta/CudaVectorArrayWrapper.hpp"
+#include "alpaka/vec/Vec.hpp"
 
 #include <functional>
 #include <numeric>
@@ -52,11 +52,14 @@ namespace alpaka::meta
     {
     };
 
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-    /// Specialization of \a IsArrayOrVector for CUDA vector array wrapper
-    template<typename T, unsigned N>
-    struct IsArrayOrVector<CudaVectorArrayWrapper<T, N>> : std::true_type
+    /** Specialization of \a IsArrayOrVector for alpaka::Vec
+     *
+     * @tparam T inner type held in the array
+     * @tparam N size of the array
+     */
+    template<typename T, typename N>
+    struct IsArrayOrVector<alpaka::Vec<N, T>> : std::true_type
     {
     };
-#endif
+
 } // namespace alpaka::meta
diff --git a/alpaka/include/alpaka/meta/TypeListOps.hpp b/alpaka/include/alpaka/meta/TypeListOps.hpp
index 2d6bcfe7..c63b6561 100644
--- a/alpaka/include/alpaka/meta/TypeListOps.hpp
+++ b/alpaka/include/alpaka/meta/TypeListOps.hpp
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <tuple>
 #include <type_traits>
 
 namespace alpaka::meta
@@ -35,4 +36,60 @@ namespace alpaka::meta
     {
         static constexpr bool value = std::is_same_v<Head, Value> || Contains<List<Tail...>, Value>::value;
     };
+
+    // copied from https://stackoverflow.com/a/51073558/22035743
+    template<typename T>
+    struct IsList : std::false_type
+    {
+    };
+
+    template<template<typename...> class TList, typename... TTypes>
+    struct IsList<TList<TTypes...>> : std::true_type
+    {
+    };
+
+    //! \brief Checks whether the specified type is a list. List is a type with a variadic number of template types.
+    template<typename T>
+    constexpr bool isList = IsList<std::decay_t<T>>::value;
+
+    namespace detail
+    {
+        template<template<typename...> class TListType, typename TType, typename = void>
+        struct ToListImpl
+        {
+            using type = TListType<TType>;
+        };
+
+        template<template<typename...> class TListType, typename TList>
+        struct ToListImpl<TListType, TList, std::enable_if_t<alpaka::meta::isList<TList>>>
+        {
+            using type = TList;
+        };
+    } // namespace detail
+
+    //! \brief Takes an arbitrary number of types (T) and creates a type list of type TListType with the types (T). If
+    //! T is a single template parameter and it satisfies alpaka::meta::isList, the type of the structure is T (no type
+    //! change). For example std::tuple can be used as TListType.
+    //! \tparam TListType type of the created list
+    //! \tparam T possible list types or type list
+    template<template<typename...> class TListType, typename... T>
+    struct ToList;
+
+    template<template<typename...> class TListType, typename T>
+    struct ToList<TListType, T> : detail::ToListImpl<TListType, T>
+    {
+    };
+
+    template<template<typename...> class TListType, typename T, typename... Ts>
+    struct ToList<TListType, T, Ts...>
+    {
+        using type = TListType<T, Ts...>;
+    };
+
+    //! \brief If T is a single argument and a type list (fullfil alpaka::meta::isList), the return type is T.
+    //! Otherwise, std::tuple is returned with T types as template parameters.
+    template<typename... T>
+    using ToTuple = typename ToList<std::tuple, T...>::type;
+
+
 } // namespace alpaka::meta
diff --git a/alpaka/include/alpaka/platform/PlatformCpuSycl.hpp b/alpaka/include/alpaka/platform/PlatformCpuSycl.hpp
index db055f96..4fdda8d2 100644
--- a/alpaka/include/alpaka/platform/PlatformCpuSycl.hpp
+++ b/alpaka/include/alpaka/platform/PlatformCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -8,8 +8,6 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/platform/PlatformGenericSycl.hpp"
 
-#include <string>
-
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 #    include <sycl/sycl.hpp>
@@ -18,7 +16,8 @@ namespace alpaka
 {
     namespace detail
     {
-        struct SyclCpuSelector
+        template<>
+        struct SYCLDeviceSelector<TagCpuSycl>
         {
             auto operator()(sycl::device const& dev) const -> int
             {
@@ -28,17 +27,7 @@ namespace alpaka
     } // namespace detail
 
     //! The SYCL device manager.
-    using PlatformCpuSycl = PlatformGenericSycl<detail::SyclCpuSelector>;
+    using PlatformCpuSycl = PlatformGenericSycl<TagCpuSycl>;
 } // namespace alpaka
 
-namespace alpaka::trait
-{
-    //! The SYCL device manager device type trait specialization.
-    template<>
-    struct DevType<PlatformCpuSycl>
-    {
-        using type = DevGenericSycl<PlatformCpuSycl>; // = DevCpuSycl
-    };
-} // namespace alpaka::trait
-
 #endif
diff --git a/alpaka/include/alpaka/platform/PlatformFpgaSyclIntel.hpp b/alpaka/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
index 21ee2c25..a3a73423 100644
--- a/alpaka/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
+++ b/alpaka/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,8 +12,6 @@
 
 #    include <sycl/sycl.hpp>
 
-#    include <string>
-
 namespace alpaka
 {
     namespace detail
@@ -24,7 +22,8 @@ namespace alpaka
 #        pragma clang diagnostic push
 #        pragma clang diagnostic ignored "-Wweak-vtables"
 #    endif
-        struct IntelFpgaSelector final
+        template<>
+        struct SYCLDeviceSelector<TagFpgaSyclIntel>
         {
 #    ifdef ALPAKA_FPGA_EMULATION
             static constexpr auto platform_name = "Intel(R) FPGA Emulation Platform for OpenCL(TM)";
@@ -46,17 +45,7 @@ namespace alpaka
     } // namespace detail
 
     //! The SYCL device manager.
-    using PlatformFpgaSyclIntel = PlatformGenericSycl<detail::IntelFpgaSelector>;
+    using PlatformFpgaSyclIntel = PlatformGenericSycl<TagFpgaSyclIntel>;
 } // namespace alpaka
 
-namespace alpaka::trait
-{
-    //! The SYCL device manager device type trait specialization.
-    template<>
-    struct DevType<PlatformFpgaSyclIntel>
-    {
-        using type = DevGenericSycl<PlatformFpgaSyclIntel>; // = DevFpgaSyclIntel
-    };
-} // namespace alpaka::trait
-
 #endif
diff --git a/alpaka/include/alpaka/platform/PlatformGenericSycl.hpp b/alpaka/include/alpaka/platform/PlatformGenericSycl.hpp
index c4df17c6..12e00fcf 100644
--- a/alpaka/include/alpaka/platform/PlatformGenericSycl.hpp
+++ b/alpaka/include/alpaka/platform/PlatformGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -6,6 +6,7 @@
 
 #include "alpaka/core/Concepts.hpp"
 #include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
 
@@ -20,16 +21,27 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wswitch-default"
+#    endif
+
 #    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
+    namespace detail
+    {
+        template<typename TTag>
+        struct SYCLDeviceSelector;
+    } // namespace detail
+
     //! The SYCL device manager.
-    template<typename TSelector>
-    struct PlatformGenericSycl : concepts::Implements<ConceptPlatform, PlatformGenericSycl<TSelector>>
+    template<typename TTag>
+    struct PlatformGenericSycl : concepts::Implements<ConceptPlatform, PlatformGenericSycl<TTag>>
     {
         PlatformGenericSycl()
-            : platform{TSelector{}}
+            : platform{detail::SYCLDeviceSelector<TTag>{}}
             , devices(platform.get_devices())
             , context{sycl::context{
                   devices,
@@ -88,636 +100,647 @@ namespace alpaka
         std::vector<sycl::device> devices;
         sycl::context context;
     };
-} // namespace alpaka
 
-namespace alpaka::trait
-{
-    //! The SYCL platform device count get trait specialization.
-    template<typename TSelector>
-    struct GetDevCount<PlatformGenericSycl<TSelector>>
+    namespace trait
     {
-        static auto getDevCount(PlatformGenericSycl<TSelector> const& platform) -> std::size_t
+        //! The SYCL platform device type trait specialization.
+        template<typename TTag>
+        struct DevType<PlatformGenericSycl<TTag>>
         {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            return platform.syclDevices().size();
-        }
-    };
+            using type = DevGenericSycl<TTag>;
+        };
 
-    //! The SYCL platform device get trait specialization.
-    template<typename TSelector>
-    struct GetDevByIdx<alpaka::PlatformGenericSycl<TSelector>>
-    {
-        static auto getDevByIdx(PlatformGenericSycl<TSelector> const& platform, std::size_t const& devIdx)
+        //! The SYCL platform device count get trait specialization.
+        template<typename TTag>
+        struct GetDevCount<PlatformGenericSycl<TTag>>
         {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            auto const& devices = platform.syclDevices();
-            if(devIdx >= devices.size())
+            static auto getDevCount(PlatformGenericSycl<TTag> const& platform) -> std::size_t
             {
-                auto ss_err = std::stringstream{};
-                ss_err << "Unable to return device handle for device " << devIdx << ". There are only "
-                       << devices.size() << " SYCL devices!";
-                throw std::runtime_error(ss_err.str());
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                return platform.syclDevices().size();
             }
+        };
 
-            auto sycl_dev = devices.at(devIdx);
+        //! The SYCL platform device get trait specialization.
+        template<typename TTag>
+        struct GetDevByIdx<PlatformGenericSycl<TTag>>
+        {
+            static auto getDevByIdx(PlatformGenericSycl<TTag> const& platform, std::size_t const& devIdx)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            // Log this device.
+                auto const& devices = platform.syclDevices();
+                if(devIdx >= devices.size())
+                {
+                    auto ss_err = std::stringstream{};
+                    ss_err << "Unable to return device handle for device " << devIdx << ". There are only "
+                           << devices.size() << " SYCL devices!";
+                    throw std::runtime_error(ss_err.str());
+                }
+
+                auto sycl_dev = devices.at(devIdx);
+
+                // Log this device.
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            printDeviceProperties(sycl_dev);
+                printDeviceProperties(sycl_dev);
 #    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-            std::cout << __func__ << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
+                std::cout << __func__ << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
 #    endif
-            using SyclPlatform = alpaka::PlatformGenericSycl<TSelector>;
-            return typename DevType<SyclPlatform>::type{sycl_dev, platform.syclContext()};
-        }
+                using SyclPlatform = alpaka::PlatformGenericSycl<TTag>;
+                return typename DevType<SyclPlatform>::type{sycl_dev, platform.syclContext()};
+            }
 
-    private:
+        private:
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-        //! Prints all the device properties to std::cout.
-        static auto printDeviceProperties(sycl::device const& device) -> void
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            //! Prints all the device properties to std::cout.
+            static auto printDeviceProperties(sycl::device const& device) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto KiB = std::size_t{1024};
-            constexpr auto MiB = KiB * KiB;
+                constexpr auto KiB = std::size_t{1024};
+                constexpr auto MiB = KiB * KiB;
 
-            std::cout << "Device type: ";
-            switch(device.get_info<sycl::info::device::device_type>())
-            {
-            case sycl::info::device_type::cpu:
-                std::cout << "CPU";
-                break;
-
-            case sycl::info::device_type::gpu:
-                std::cout << "GPU";
-                break;
-
-            case sycl::info::device_type::accelerator:
-                std::cout << "Accelerator";
-                break;
-
-            case sycl::info::device_type::custom:
-                std::cout << "Custom";
-                break;
-
-            case sycl::info::device_type::automatic:
-                std::cout << "Automatic";
-                break;
-
-            case sycl::info::device_type::host:
-                std::cout << "Host";
-                break;
-
-            // The SYCL spec forbids the return of device_type::all
-            // Including this here to prevent warnings because of
-            // missing cases
-            case sycl::info::device_type::all:
-                std::cout << "All";
-                break;
-            }
-            std::cout << '\n';
+                std::cout << "Device type: ";
+                switch(device.get_info<sycl::info::device::device_type>())
+                {
+                case sycl::info::device_type::cpu:
+                    std::cout << "CPU";
+                    break;
 
-            std::cout << "Name: " << device.get_info<sycl::info::device::name>() << '\n';
+                case sycl::info::device_type::gpu:
+                    std::cout << "GPU";
+                    break;
+
+                case sycl::info::device_type::accelerator:
+                    std::cout << "Accelerator";
+                    break;
+
+                case sycl::info::device_type::custom:
+                    std::cout << "Custom";
+                    break;
+
+                case sycl::info::device_type::automatic:
+                    std::cout << "Automatic";
+                    break;
+
+                case sycl::info::device_type::host:
+                    std::cout << "Host";
+                    break;
+
+                // The SYCL spec forbids the return of device_type::all
+                // Including this here to prevent warnings because of
+                // missing cases
+                case sycl::info::device_type::all:
+                    std::cout << "All";
+                    break;
+                }
+                std::cout << '\n';
 
-            std::cout << "Vendor: " << device.get_info<sycl::info::device::vendor>() << '\n';
+                std::cout << "Name: " << device.get_info<sycl::info::device::name>() << '\n';
 
-            std::cout << "Vendor ID: " << device.get_info<sycl::info::device::vendor_id>() << '\n';
+                std::cout << "Vendor: " << device.get_info<sycl::info::device::vendor>() << '\n';
 
-            std::cout << "Driver version: " << device.get_info<sycl::info::device::driver_version>() << '\n';
+                std::cout << "Vendor ID: " << device.get_info<sycl::info::device::vendor_id>() << '\n';
 
-            std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
+                std::cout << "Driver version: " << device.get_info<sycl::info::device::driver_version>() << '\n';
+
+                std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
 
 #        if !defined(BOOST_COMP_ICPX)
-            // Not defined by Level Zero back-end
-            std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
+                // Not defined by Level Zero back-end
+                std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
 #        endif
 
-            std::cout << "Aspects: " << '\n';
+                std::cout << "Aspects: " << '\n';
 
 #        if defined(BOOST_COMP_ICPX)
 #            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-            // These aspects are missing from oneAPI versions < 2023.2.0
-            if(device.has(sycl::aspect::emulated))
-                std::cout << "\t* emulated\n";
+                // These aspects are missing from oneAPI versions < 2023.2.0
+                if(device.has(sycl::aspect::emulated))
+                    std::cout << "\t* emulated\n";
 
-            if(device.has(sycl::aspect::host_debuggable))
-                std::cout << "\t* debuggable using standard debuggers\n";
+                if(device.has(sycl::aspect::host_debuggable))
+                    std::cout << "\t* debuggable using standard debuggers\n";
 #            endif
 #        endif
 
-            if(device.has(sycl::aspect::fp16))
-                std::cout << "\t* supports sycl::half precision\n";
-
-            if(device.has(sycl::aspect::fp64))
-                std::cout << "\t* supports double precision\n";
+                if(device.has(sycl::aspect::fp16))
+                    std::cout << "\t* supports sycl::half precision\n";
 
-            if(device.has(sycl::aspect::atomic64))
-                std::cout << "\t* supports 64-bit atomics\n";
+                if(device.has(sycl::aspect::fp64))
+                    std::cout << "\t* supports double precision\n";
 
-            if(device.has(sycl::aspect::image))
-                std::cout << "\t* supports images\n";
+                if(device.has(sycl::aspect::atomic64))
+                    std::cout << "\t* supports 64-bit atomics\n";
 
-            if(device.has(sycl::aspect::online_compiler))
-                std::cout << "\t* supports online compilation of device code\n";
+                if(device.has(sycl::aspect::image))
+                    std::cout << "\t* supports images\n";
 
-            if(device.has(sycl::aspect::online_linker))
-                std::cout << "\t* supports online linking of device code\n";
+                if(device.has(sycl::aspect::online_compiler))
+                    std::cout << "\t* supports online compilation of device code\n";
 
-            if(device.has(sycl::aspect::queue_profiling))
-                std::cout << "\t* supports queue profiling\n";
+                if(device.has(sycl::aspect::online_linker))
+                    std::cout << "\t* supports online linking of device code\n";
 
-            if(device.has(sycl::aspect::usm_device_allocations))
-                std::cout << "\t* supports explicit USM allocations\n";
+                if(device.has(sycl::aspect::queue_profiling))
+                    std::cout << "\t* supports queue profiling\n";
 
-            if(device.has(sycl::aspect::usm_host_allocations))
-                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
+                if(device.has(sycl::aspect::usm_device_allocations))
+                    std::cout << "\t* supports explicit USM allocations\n";
 
-            if(device.has(sycl::aspect::usm_atomic_host_allocations))
-                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
+                if(device.has(sycl::aspect::usm_host_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
 
-            if(device.has(sycl::aspect::usm_shared_allocations))
-                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
+                if(device.has(sycl::aspect::usm_atomic_host_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
 
-            if(device.has(sycl::aspect::usm_atomic_shared_allocations))
-                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
+                if(device.has(sycl::aspect::usm_shared_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
 
-            if(device.has(sycl::aspect::usm_system_allocations))
-                std::cout << "\t* can access memory allocated by the system allocator\n";
+                if(device.has(sycl::aspect::usm_atomic_shared_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
 
-            std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
-                      << '\n';
+                if(device.has(sycl::aspect::usm_system_allocations))
+                    std::cout << "\t* can access memory allocated by the system allocator\n";
 
-            std::cout << "Maximum work item dimensions: ";
-            auto dims = device.get_info<sycl::info::device::max_work_item_dimensions>();
-            std::cout << dims << std::endl;
+                std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
+                          << '\n';
 
-            std::cout << "Maximum number of work items:\n";
-            auto const wi_1D = device.get_info<sycl::info::device::max_work_item_sizes<1>>();
-            auto const wi_2D = device.get_info<sycl::info::device::max_work_item_sizes<2>>();
-            auto const wi_3D = device.get_info<sycl::info::device::max_work_item_sizes<3>>();
-            std::cout << "\t* 1D: (" << wi_1D.get(0) << ")\n";
-            std::cout << "\t* 2D: (" << wi_2D.get(0) << ", " << wi_2D.get(1) << ")\n";
-            std::cout << "\t* 3D: (" << wi_3D.get(0) << ", " << wi_3D.get(1) << ", " << wi_3D.get(2) << ")\n";
+                std::cout << "Maximum work item dimensions: ";
+                auto dims = device.get_info<sycl::info::device::max_work_item_dimensions>();
+                std::cout << dims << std::endl;
 
-            std::cout << "Maximum number of work items per work-group: "
-                      << device.get_info<sycl::info::device::max_work_group_size>() << '\n';
+                std::cout << "Maximum number of work items:\n";
+                auto const wi_1D = device.get_info<sycl::info::device::max_work_item_sizes<1>>();
+                auto const wi_2D = device.get_info<sycl::info::device::max_work_item_sizes<2>>();
+                auto const wi_3D = device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+                std::cout << "\t* 1D: (" << wi_1D.get(0) << ")\n";
+                std::cout << "\t* 2D: (" << wi_2D.get(0) << ", " << wi_2D.get(1) << ")\n";
+                std::cout << "\t* 3D: (" << wi_3D.get(0) << ", " << wi_3D.get(1) << ", " << wi_3D.get(2) << ")\n";
 
-            std::cout << "Maximum number of sub-groups per work-group: "
-                      << device.get_info<sycl::info::device::max_num_sub_groups>() << '\n';
+                std::cout << "Maximum number of work items per work-group: "
+                          << device.get_info<sycl::info::device::max_work_group_size>() << '\n';
 
-            std::cout << "Supported sub-group sizes: ";
-            auto const sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
-            for(auto const& sz : sg_sizes)
-                std::cout << sz << ", ";
-            std::cout << '\n';
+                std::cout << "Maximum number of sub-groups per work-group: "
+                          << device.get_info<sycl::info::device::max_num_sub_groups>() << '\n';
 
-            std::cout << "Preferred native vector width (char): "
-                      << device.get_info<sycl::info::device::preferred_vector_width_char>() << '\n';
+                std::cout << "Supported sub-group sizes: ";
+                auto const sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
+                for(auto const& sz : sg_sizes)
+                    std::cout << sz << ", ";
+                std::cout << '\n';
 
-            std::cout << "Native ISA vector width (char): "
-                      << device.get_info<sycl::info::device::native_vector_width_char>() << '\n';
+                std::cout << "Preferred native vector width (char): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_char>() << '\n';
 
-            std::cout << "Preferred native vector width (short): "
-                      << device.get_info<sycl::info::device::preferred_vector_width_short>() << '\n';
+                std::cout << "Native ISA vector width (char): "
+                          << device.get_info<sycl::info::device::native_vector_width_char>() << '\n';
 
-            std::cout << "Native ISA vector width (short): "
-                      << device.get_info<sycl::info::device::native_vector_width_short>() << '\n';
+                std::cout << "Preferred native vector width (short): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_short>() << '\n';
 
-            std::cout << "Preferred native vector width (int): "
-                      << device.get_info<sycl::info::device::preferred_vector_width_int>() << '\n';
+                std::cout << "Native ISA vector width (short): "
+                          << device.get_info<sycl::info::device::native_vector_width_short>() << '\n';
 
-            std::cout << "Native ISA vector width (int): "
-                      << device.get_info<sycl::info::device::native_vector_width_int>() << '\n';
+                std::cout << "Preferred native vector width (int): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_int>() << '\n';
 
-            std::cout << "Preferred native vector width (long): "
-                      << device.get_info<sycl::info::device::preferred_vector_width_long>() << '\n';
+                std::cout << "Native ISA vector width (int): "
+                          << device.get_info<sycl::info::device::native_vector_width_int>() << '\n';
 
-            std::cout << "Native ISA vector width (long): "
-                      << device.get_info<sycl::info::device::native_vector_width_long>() << '\n';
+                std::cout << "Preferred native vector width (long): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_long>() << '\n';
 
-            std::cout << "Preferred native vector width (float): "
-                      << device.get_info<sycl::info::device::preferred_vector_width_float>() << '\n';
+                std::cout << "Native ISA vector width (long): "
+                          << device.get_info<sycl::info::device::native_vector_width_long>() << '\n';
 
-            std::cout << "Native ISA vector width (float): "
-                      << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
+                std::cout << "Preferred native vector width (float): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_float>() << '\n';
 
-            if(device.has(sycl::aspect::fp64))
-            {
-                std::cout << "Preferred native vector width (double): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
+                std::cout << "Native ISA vector width (float): "
+                          << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
 
-                std::cout << "Native ISA vector width (double): "
-                          << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
-            }
+                if(device.has(sycl::aspect::fp64))
+                {
+                    std::cout << "Preferred native vector width (double): "
+                              << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
 
-            if(device.has(sycl::aspect::fp16))
-            {
-                std::cout << "Preferred native vector width (half): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
+                    std::cout << "Native ISA vector width (double): "
+                              << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
+                }
 
-                std::cout << "Native ISA vector width (half): "
-                          << device.get_info<sycl::info::device::native_vector_width_half>() << '\n';
-            }
+                if(device.has(sycl::aspect::fp16))
+                {
+                    std::cout << "Preferred native vector width (half): "
+                              << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
 
-            std::cout << "Maximum clock frequency: " << device.get_info<sycl::info::device::max_clock_frequency>()
-                      << " MHz\n";
+                    std::cout << "Native ISA vector width (half): "
+                              << device.get_info<sycl::info::device::native_vector_width_half>() << '\n';
+                }
 
-            std::cout << "Address space size: " << device.get_info<sycl::info::device::address_bits>() << "-bit\n";
+                std::cout << "Maximum clock frequency: " << device.get_info<sycl::info::device::max_clock_frequency>()
+                          << " MHz\n";
 
-            std::cout << "Maximum size of memory object allocation: "
-                      << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
+                std::cout << "Address space size: " << device.get_info<sycl::info::device::address_bits>() << "-bit\n";
 
-            if(device.has(sycl::aspect::image))
-            {
-                std::cout << "Maximum number of simultaneous image object reads per kernel: "
-                          << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
+                std::cout << "Maximum size of memory object allocation: "
+                          << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
 
-                std::cout << "Maximum number of simultaneous image writes per kernel: "
-                          << device.get_info<sycl::info::device::max_write_image_args>() << '\n';
+                if(device.has(sycl::aspect::image))
+                {
+                    std::cout << "Maximum number of simultaneous image object reads per kernel: "
+                              << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
 
-                std::cout << "Maximum 1D/2D image width: " << device.get_info<sycl::info::device::image2d_max_width>()
-                          << " px\n";
+                    std::cout << "Maximum number of simultaneous image writes per kernel: "
+                              << device.get_info<sycl::info::device::max_write_image_args>() << '\n';
 
-                std::cout << "Maximum 2D image height: " << device.get_info<sycl::info::device::image2d_max_height>()
-                          << " px\n";
+                    std::cout << "Maximum 1D/2D image width: "
+                              << device.get_info<sycl::info::device::image2d_max_width>() << " px\n";
 
-                std::cout << "Maximum 3D image width: " << device.get_info<sycl::info::device::image3d_max_width>()
-                          << " px\n";
+                    std::cout << "Maximum 2D image height: "
+                              << device.get_info<sycl::info::device::image2d_max_height>() << " px\n";
 
-                std::cout << "Maximum 3D image height: " << device.get_info<sycl::info::device::image3d_max_height>()
-                          << " px\n";
+                    std::cout << "Maximum 3D image width: " << device.get_info<sycl::info::device::image3d_max_width>()
+                              << " px\n";
 
-                std::cout << "Maximum 3D image depth: " << device.get_info<sycl::info::device::image3d_max_depth>()
-                          << " px\n";
+                    std::cout << "Maximum 3D image height: "
+                              << device.get_info<sycl::info::device::image3d_max_height>() << " px\n";
 
-                std::cout << "Maximum number of samplers per kernel: "
-                          << device.get_info<sycl::info::device::max_samplers>() << '\n';
-            }
+                    std::cout << "Maximum 3D image depth: " << device.get_info<sycl::info::device::image3d_max_depth>()
+                              << " px\n";
 
-            std::cout << "Maximum kernel argument size: " << device.get_info<sycl::info::device::max_parameter_size>()
-                      << " bytes\n";
+                    std::cout << "Maximum number of samplers per kernel: "
+                              << device.get_info<sycl::info::device::max_samplers>() << '\n';
+                }
 
-            std::cout << "Memory base address alignment: "
-                      << device.get_info<sycl::info::device::mem_base_addr_align>() << " bit\n";
+                std::cout << "Maximum kernel argument size: "
+                          << device.get_info<sycl::info::device::max_parameter_size>() << " bytes\n";
 
-            auto print_fp_config = [](std::string const& fp, std::vector<sycl::info::fp_config> const& conf)
-            {
-                std::cout << fp << " precision floating-point capabilities:\n";
+                std::cout << "Memory base address alignment: "
+                          << device.get_info<sycl::info::device::mem_base_addr_align>() << " bit\n";
 
-                auto find_and_print = [&](sycl::info::fp_config val)
+                auto print_fp_config = [](std::string const& fp, std::vector<sycl::info::fp_config> const& conf)
                 {
-                    auto it = std::find(begin(conf), end(conf), val);
-                    std::cout << (it == std::end(conf) ? "No" : "Yes") << '\n';
-                };
+                    std::cout << fp << " precision floating-point capabilities:\n";
 
-                std::cout << "\t* denorm support: ";
-                find_and_print(sycl::info::fp_config::denorm);
+                    auto find_and_print = [&](sycl::info::fp_config val)
+                    {
+                        auto it = std::find(begin(conf), end(conf), val);
+                        std::cout << (it == std::end(conf) ? "No" : "Yes") << '\n';
+                    };
 
-                std::cout << "\t* INF & quiet NaN support: ";
-                find_and_print(sycl::info::fp_config::inf_nan);
+                    std::cout << "\t* denorm support: ";
+                    find_and_print(sycl::info::fp_config::denorm);
 
-                std::cout << "\t* round to nearest even support: ";
-                find_and_print(sycl::info::fp_config::round_to_nearest);
+                    std::cout << "\t* INF & quiet NaN support: ";
+                    find_and_print(sycl::info::fp_config::inf_nan);
 
-                std::cout << "\t* round to zero support: ";
-                find_and_print(sycl::info::fp_config::round_to_zero);
+                    std::cout << "\t* round to nearest even support: ";
+                    find_and_print(sycl::info::fp_config::round_to_nearest);
 
-                std::cout << "\t* round to infinity support: ";
-                find_and_print(sycl::info::fp_config::round_to_inf);
+                    std::cout << "\t* round to zero support: ";
+                    find_and_print(sycl::info::fp_config::round_to_zero);
 
-                std::cout << "\t* IEEE754-2008 FMA support: ";
-                find_and_print(sycl::info::fp_config::fma);
+                    std::cout << "\t* round to infinity support: ";
+                    find_and_print(sycl::info::fp_config::round_to_inf);
 
-                std::cout << "\t* correctly rounded divide/sqrt support: ";
-                find_and_print(sycl::info::fp_config::correctly_rounded_divide_sqrt);
+                    std::cout << "\t* IEEE754-2008 FMA support: ";
+                    find_and_print(sycl::info::fp_config::fma);
 
-                std::cout << "\t* software-implemented floating point operations: ";
-                find_and_print(sycl::info::fp_config::soft_float);
-            };
+                    std::cout << "\t* correctly rounded divide/sqrt support: ";
+                    find_and_print(sycl::info::fp_config::correctly_rounded_divide_sqrt);
 
-            if(device.has(sycl::aspect::fp16))
-            {
-                auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
-                print_fp_config("Half", fp16_conf);
-            }
+                    std::cout << "\t* software-implemented floating point operations: ";
+                    find_and_print(sycl::info::fp_config::soft_float);
+                };
 
-            auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
-            print_fp_config("Single", fp32_conf);
+                if(device.has(sycl::aspect::fp16))
+                {
+                    auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
+                    print_fp_config("Half", fp16_conf);
+                }
 
-            if(device.has(sycl::aspect::fp64))
-            {
-                auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
-                print_fp_config("Double", fp64_conf);
-            }
+                auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
+                print_fp_config("Single", fp32_conf);
 
-            std::cout << "Global memory cache type: ";
-            auto has_global_mem_cache = false;
-            switch(device.get_info<sycl::info::device::global_mem_cache_type>())
-            {
-            case sycl::info::global_mem_cache_type::none:
-                std::cout << "none";
-                break;
-
-            case sycl::info::global_mem_cache_type::read_only:
-                std::cout << "read-only";
-                has_global_mem_cache = true;
-                break;
-
-            case sycl::info::global_mem_cache_type::read_write:
-                std::cout << "read-write";
-                has_global_mem_cache = true;
-                break;
-            }
-            std::cout << '\n';
+                if(device.has(sycl::aspect::fp64))
+                {
+                    auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
+                    print_fp_config("Double", fp64_conf);
+                }
 
-            if(has_global_mem_cache)
-            {
-                std::cout << "Global memory cache line size: "
-                          << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
+                std::cout << "Global memory cache type: ";
+                auto has_global_mem_cache = false;
+                switch(device.get_info<sycl::info::device::global_mem_cache_type>())
+                {
+                case sycl::info::global_mem_cache_type::none:
+                    std::cout << "none";
+                    break;
 
-                std::cout << "Global memory cache size: "
-                          << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
-            }
+                case sycl::info::global_mem_cache_type::read_only:
+                    std::cout << "read-only";
+                    has_global_mem_cache = true;
+                    break;
 
-            std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
-                      << " MiB" << std::endl;
+                case sycl::info::global_mem_cache_type::read_write:
+                    std::cout << "read-write";
+                    has_global_mem_cache = true;
+                    break;
+                }
+                std::cout << '\n';
 
-            std::cout << "Local memory type: ";
-            auto has_local_memory = false;
-            switch(device.get_info<sycl::info::device::local_mem_type>())
-            {
-            case sycl::info::local_mem_type::none:
-                std::cout << "none";
-                break;
-
-            case sycl::info::local_mem_type::local:
-                std::cout << "local";
-                has_local_memory = true;
-                break;
-
-            case sycl::info::local_mem_type::global:
-                std::cout << "global";
-                has_local_memory = true;
-                break;
-            }
-            std::cout << '\n';
+                if(has_global_mem_cache)
+                {
+                    std::cout << "Global memory cache line size: "
+                              << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
 
-            if(has_local_memory)
-                std::cout << "Local memory size: " << device.get_info<sycl::info::device::local_mem_size>() / KiB
-                          << " KiB\n";
+                    std::cout << "Global memory cache size: "
+                              << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
+                }
 
-            std::cout << "Error correction support: "
-                      << (device.get_info<sycl::info::device::error_correction_support>() ? "Yes" : "No") << '\n';
+                std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
+                          << " MiB" << std::endl;
 
-            auto print_memory_orders = [](std::vector<sycl::memory_order> const& mem_orders)
-            {
-                for(auto const& cap : mem_orders)
+                std::cout << "Local memory type: ";
+                auto has_local_memory = false;
+                switch(device.get_info<sycl::info::device::local_mem_type>())
+                {
+                case sycl::info::local_mem_type::none:
+                    std::cout << "none";
+                    break;
+
+                case sycl::info::local_mem_type::local:
+                    std::cout << "local";
+                    has_local_memory = true;
+                    break;
+
+                case sycl::info::local_mem_type::global:
+                    std::cout << "global";
+                    has_local_memory = true;
+                    break;
+                }
+                std::cout << '\n';
+
+                if(has_local_memory)
+                    std::cout << "Local memory size: " << device.get_info<sycl::info::device::local_mem_size>() / KiB
+                              << " KiB\n";
+
+                std::cout << "Error correction support: "
+                          << (device.get_info<sycl::info::device::error_correction_support>() ? "Yes" : "No") << '\n';
+
+                auto print_memory_orders = [](std::vector<sycl::memory_order> const& mem_orders)
                 {
-                    switch(cap)
+                    for(auto const& cap : mem_orders)
                     {
-                    case sycl::memory_order::relaxed:
-                        std::cout << "relaxed";
-                        break;
+                        switch(cap)
+                        {
+                        case sycl::memory_order::relaxed:
+                            std::cout << "relaxed";
+                            break;
 
-                    case sycl::memory_order::acquire:
-                        std::cout << "acquire";
-                        break;
+                        case sycl::memory_order::acquire:
+                            std::cout << "acquire";
+                            break;
 
-                    case sycl::memory_order::release:
-                        std::cout << "release";
-                        break;
+                        case sycl::memory_order::release:
+                            std::cout << "release";
+                            break;
 
-                    case sycl::memory_order::acq_rel:
-                        std::cout << "acq_rel";
-                        break;
+                        case sycl::memory_order::acq_rel:
+                            std::cout << "acq_rel";
+                            break;
 
-                    case sycl::memory_order::seq_cst:
-                        std::cout << "seq_cst";
-                        break;
+                        case sycl::memory_order::seq_cst:
+                            std::cout << "seq_cst";
+                            break;
 #        if defined(BOOST_COMP_ICPX)
-                    // Stop icpx from complaining about its own internals.
-                    case sycl::memory_order::__consume_unsupported:
-                        break;
+                        // Stop icpx from complaining about its own internals.
+                        case sycl::memory_order::__consume_unsupported:
+                            break;
 #        endif
+                        }
+                        std::cout << ", ";
                     }
-                    std::cout << ", ";
-                }
-                std::cout << '\n';
-            };
+                    std::cout << '\n';
+                };
 
-            std::cout << "Supported memory orderings for atomic operations: ";
-            auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
-            print_memory_orders(mem_orders);
+                std::cout << "Supported memory orderings for atomic operations: ";
+                auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
+                print_memory_orders(mem_orders);
 
 #        if defined(BOOST_COMP_ICPX)
 #            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-            // Not implemented in oneAPI < 2023.2.0
-            std::cout << "Supported memory orderings for sycl::atomic_fence: ";
-            auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
-            print_memory_orders(fence_orders);
+                // Not implemented in oneAPI < 2023.2.0
+                std::cout << "Supported memory orderings for sycl::atomic_fence: ";
+                auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
+                print_memory_orders(fence_orders);
 #            endif
 #        endif
 
-            auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
-            {
-                for(auto const& cap : mem_scopes)
+                auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
                 {
-                    switch(cap)
+                    for(auto const& cap : mem_scopes)
                     {
-                    case sycl::memory_scope::work_item:
-                        std::cout << "work-item";
-                        break;
+                        switch(cap)
+                        {
+                        case sycl::memory_scope::work_item:
+                            std::cout << "work-item";
+                            break;
 
-                    case sycl::memory_scope::sub_group:
-                        std::cout << "sub-group";
-                        break;
+                        case sycl::memory_scope::sub_group:
+                            std::cout << "sub-group";
+                            break;
 
-                    case sycl::memory_scope::work_group:
-                        std::cout << "work-group";
-                        break;
+                        case sycl::memory_scope::work_group:
+                            std::cout << "work-group";
+                            break;
 
-                    case sycl::memory_scope::device:
-                        std::cout << "device";
-                        break;
+                        case sycl::memory_scope::device:
+                            std::cout << "device";
+                            break;
 
-                    case sycl::memory_scope::system:
-                        std::cout << "system";
-                        break;
+                        case sycl::memory_scope::system:
+                            std::cout << "system";
+                            break;
+                        }
+                        std::cout << ", ";
                     }
-                    std::cout << ", ";
-                }
-                std::cout << '\n';
-            };
+                    std::cout << '\n';
+                };
 
-            std::cout << "Supported memory scopes for atomic operations: ";
-            auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
-            print_memory_scopes(mem_scopes);
+                std::cout << "Supported memory scopes for atomic operations: ";
+                auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
+                print_memory_scopes(mem_scopes);
 
 #        if defined(BOOST_COMP_ICPX)
 #            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-            // Not implemented in oneAPI < 2023.2.0
-            std::cout << "Supported memory scopes for sycl::atomic_fence: ";
-            auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
-            print_memory_scopes(fence_scopes);
+                // Not implemented in oneAPI < 2023.2.0
+                std::cout << "Supported memory scopes for sycl::atomic_fence: ";
+                auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
+                print_memory_scopes(fence_scopes);
 #            endif
 #        endif
 
-            std::cout << "Device timer resolution: "
-                      << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
+                std::cout << "Device timer resolution: "
+                          << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
 
-            std::cout << "Built-in kernels: ";
-            auto const builtins = device.get_info<sycl::info::device::built_in_kernel_ids>();
-            for(auto const& b : builtins)
-                std::cout << b.get_name() << ", ";
-            std::cout << '\n';
+                std::cout << "Built-in kernels: ";
+                auto const builtins = device.get_info<sycl::info::device::built_in_kernel_ids>();
+                for(auto const& b : builtins)
+                    std::cout << b.get_name() << ", ";
+                std::cout << '\n';
 
-            std::cout << "Maximum number of subdevices: ";
-            auto const max_subs = device.get_info<sycl::info::device::partition_max_sub_devices>();
-            std::cout << max_subs << '\n';
+                std::cout << "Maximum number of subdevices: ";
+                auto const max_subs = device.get_info<sycl::info::device::partition_max_sub_devices>();
+                std::cout << max_subs << '\n';
 
-            if(max_subs > 1)
-            {
-                std::cout << "Supported partition properties: ";
-                auto const part_props = device.get_info<sycl::info::device::partition_properties>();
-                auto has_affinity_domains = false;
-                for(auto const& prop : part_props)
+                if(max_subs > 1)
                 {
-                    switch(prop)
+                    std::cout << "Supported partition properties: ";
+                    auto const part_props = device.get_info<sycl::info::device::partition_properties>();
+                    auto has_affinity_domains = false;
+                    for(auto const& prop : part_props)
+                    {
+                        switch(prop)
+                        {
+                        case sycl::info::partition_property::no_partition:
+                            std::cout << "no partition";
+                            break;
+
+                        case sycl::info::partition_property::partition_equally:
+                            std::cout << "equally";
+                            break;
+
+                        case sycl::info::partition_property::partition_by_counts:
+                            std::cout << "by counts";
+                            break;
+
+                        case sycl::info::partition_property::partition_by_affinity_domain:
+                            std::cout << "by affinity domain";
+                            has_affinity_domains = true;
+                            break;
+#        if defined(BOOST_COMP_ICPX)
+                        case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                            std::cout << "by compute slice (Intel extension; deprecated)";
+                            break;
+#        endif
+                        }
+                        std::cout << ", ";
+                    }
+                    std::cout << '\n';
+
+                    if(has_affinity_domains)
+                    {
+                        std::cout << "Supported partition affinity domains: ";
+                        auto const aff_doms = device.get_info<sycl::info::device::partition_affinity_domains>();
+                        for(auto const& dom : aff_doms)
+                        {
+                            switch(dom)
+                            {
+                            case sycl::info::partition_affinity_domain::not_applicable:
+                                std::cout << "not applicable";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::numa:
+                                std::cout << "NUMA";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L4_cache:
+                                std::cout << "L4 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L3_cache:
+                                std::cout << "L3 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L2_cache:
+                                std::cout << "L2 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L1_cache:
+                                std::cout << "L1 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::next_partitionable:
+                                std::cout << "next partitionable";
+                                break;
+                            }
+                            std::cout << ", ";
+                        }
+                        std::cout << '\n';
+                    }
+
+                    std::cout << "Current partition property: ";
+                    switch(device.get_info<sycl::info::device::partition_type_property>())
                     {
                     case sycl::info::partition_property::no_partition:
                         std::cout << "no partition";
                         break;
 
                     case sycl::info::partition_property::partition_equally:
-                        std::cout << "equally";
+                        std::cout << "partitioned equally";
                         break;
 
                     case sycl::info::partition_property::partition_by_counts:
-                        std::cout << "by counts";
+                        std::cout << "partitioned by counts";
                         break;
 
                     case sycl::info::partition_property::partition_by_affinity_domain:
-                        std::cout << "by affinity domain";
-                        has_affinity_domains = true;
+                        std::cout << "partitioned by affinity domain";
                         break;
+
 #        if defined(BOOST_COMP_ICPX)
                     case sycl::info::partition_property::ext_intel_partition_by_cslice:
-                        std::cout << "by compute slice (Intel extension; deprecated)";
+                        std::cout << "partitioned by compute slice (Intel extension; deprecated)";
                         break;
 #        endif
                     }
-                    std::cout << ", ";
-                }
-                std::cout << '\n';
+                    std::cout << '\n';
 
-                if(has_affinity_domains)
-                {
-                    std::cout << "Supported partition affinity domains: ";
-                    auto const aff_doms = device.get_info<sycl::info::device::partition_affinity_domains>();
-                    for(auto const& dom : aff_doms)
+                    std::cout << "Current partition affinity domain: ";
+                    switch(device.get_info<sycl::info::device::partition_type_affinity_domain>())
                     {
-                        switch(dom)
-                        {
-                        case sycl::info::partition_affinity_domain::not_applicable:
-                            std::cout << "not applicable";
-                            break;
+                    case sycl::info::partition_affinity_domain::not_applicable:
+                        std::cout << "not applicable";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::numa:
-                            std::cout << "NUMA";
-                            break;
+                    case sycl::info::partition_affinity_domain::numa:
+                        std::cout << "NUMA";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::L4_cache:
-                            std::cout << "L4 cache";
-                            break;
+                    case sycl::info::partition_affinity_domain::L4_cache:
+                        std::cout << "L4 cache";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::L3_cache:
-                            std::cout << "L3 cache";
-                            break;
+                    case sycl::info::partition_affinity_domain::L3_cache:
+                        std::cout << "L3 cache";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::L2_cache:
-                            std::cout << "L2 cache";
-                            break;
+                    case sycl::info::partition_affinity_domain::L2_cache:
+                        std::cout << "L2 cache";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::L1_cache:
-                            std::cout << "L1 cache";
-                            break;
+                    case sycl::info::partition_affinity_domain::L1_cache:
+                        std::cout << "L1 cache";
+                        break;
 
-                        case sycl::info::partition_affinity_domain::next_partitionable:
-                            std::cout << "next partitionable";
-                            break;
-                        }
-                        std::cout << ", ";
+                    case sycl::info::partition_affinity_domain::next_partitionable:
+                        std::cout << "next partitionable";
+                        break;
                     }
                     std::cout << '\n';
                 }
 
-                std::cout << "Current partition property: ";
-                switch(device.get_info<sycl::info::device::partition_type_property>())
-                {
-                case sycl::info::partition_property::no_partition:
-                    std::cout << "no partition";
-                    break;
-
-                case sycl::info::partition_property::partition_equally:
-                    std::cout << "partitioned equally";
-                    break;
-
-                case sycl::info::partition_property::partition_by_counts:
-                    std::cout << "partitioned by counts";
-                    break;
-
-                case sycl::info::partition_property::partition_by_affinity_domain:
-                    std::cout << "partitioned by affinity domain";
-                    break;
-
-#        if defined(BOOST_COMP_ICPX)
-                case sycl::info::partition_property::ext_intel_partition_by_cslice:
-                    std::cout << "partitioned by compute slice (Intel extension; deprecated)";
-                    break;
-#        endif
-                }
-                std::cout << '\n';
-
-                std::cout << "Current partition affinity domain: ";
-                switch(device.get_info<sycl::info::device::partition_type_affinity_domain>())
-                {
-                case sycl::info::partition_affinity_domain::not_applicable:
-                    std::cout << "not applicable";
-                    break;
-
-                case sycl::info::partition_affinity_domain::numa:
-                    std::cout << "NUMA";
-                    break;
-
-                case sycl::info::partition_affinity_domain::L4_cache:
-                    std::cout << "L4 cache";
-                    break;
-
-                case sycl::info::partition_affinity_domain::L3_cache:
-                    std::cout << "L3 cache";
-                    break;
-
-                case sycl::info::partition_affinity_domain::L2_cache:
-                    std::cout << "L2 cache";
-                    break;
-
-                case sycl::info::partition_affinity_domain::L1_cache:
-                    std::cout << "L1 cache";
-                    break;
-
-                case sycl::info::partition_affinity_domain::next_partitionable:
-                    std::cout << "next partitionable";
-                    break;
-                }
-                std::cout << '\n';
+                std::cout.flush();
             }
+#    endif
+        };
+    } // namespace trait
+} // namespace alpaka
 
-            std::cout.flush();
-        }
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
 #    endif
-    };
-} // namespace alpaka::trait
 
 #endif
diff --git a/alpaka/include/alpaka/platform/PlatformGpuSyclIntel.hpp b/alpaka/include/alpaka/platform/PlatformGpuSyclIntel.hpp
index 216bb5ae..d49695a8 100644
--- a/alpaka/include/alpaka/platform/PlatformGpuSyclIntel.hpp
+++ b/alpaka/include/alpaka/platform/PlatformGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -8,8 +8,6 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/platform/PlatformGenericSycl.hpp"
 
-#include <string>
-
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 #    include <sycl/sycl.hpp>
@@ -18,7 +16,8 @@ namespace alpaka
 {
     namespace detail
     {
-        struct IntelGpuSelector
+        template<>
+        struct SYCLDeviceSelector<TagGpuSyclIntel>
         {
             auto operator()(sycl::device const& dev) const -> int
             {
@@ -31,17 +30,7 @@ namespace alpaka
     } // namespace detail
 
     //! The SYCL device manager.
-    using PlatformGpuSyclIntel = PlatformGenericSycl<detail::IntelGpuSelector>;
+    using PlatformGpuSyclIntel = PlatformGenericSycl<TagGpuSyclIntel>;
 } // namespace alpaka
 
-namespace alpaka::trait
-{
-    //! The SYCL device manager device type trait specialization.
-    template<>
-    struct DevType<PlatformGpuSyclIntel>
-    {
-        using type = DevGenericSycl<PlatformGpuSyclIntel>; // = DevGpuSyclIntel
-    };
-} // namespace alpaka::trait
-
 #endif
diff --git a/alpaka/include/alpaka/platform/PlatformUniformCudaHipRt.hpp b/alpaka/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
index 9784f545..a3ae0ef0 100644
--- a/alpaka/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
+++ b/alpaka/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
@@ -234,7 +234,7 @@ namespace alpaka
                     std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
                     std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / KiB
                               << " KiB" << std::endl;
-                    std::cout << "gcnArch: " << devProp.gcnArch << std::endl;
+                    std::cout << "gcnArchName: " << devProp.gcnArchName << std::endl;
                     std::cout << "arch: " << std::endl;
                     std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
                     std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch
diff --git a/alpaka/include/alpaka/queue/QueueCpuSyclBlocking.hpp b/alpaka/include/alpaka/queue/QueueCpuSyclBlocking.hpp
index 63dc39fc..392740ae 100644
--- a/alpaka/include/alpaka/queue/QueueCpuSyclBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueCpuSyclBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<DevCpuSycl>;
+    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<TagCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp b/alpaka/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
index d3fab4dc..19904bae 100644
--- a/alpaka/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<DevCpuSycl>;
+    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<TagCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp b/alpaka/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
index 9ff2e58d..7c2f791a 100644
--- a/alpaka/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
-    using QueueFpgaSyclIntelBlocking = QueueGenericSyclBlocking<DevFpgaSyclIntel>;
+    using QueueFpgaSyclIntelBlocking = QueueGenericSyclBlocking<TagFpgaSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp b/alpaka/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
index 20ea0bb8..de1d7a62 100644
--- a/alpaka/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
 namespace alpaka
 {
-    using QueueFpgaSyclIntelNonBlocking = QueueGenericSyclNonBlocking<DevFpgaSyclIntel>;
+    using QueueFpgaSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagFpgaSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueGenericSyclBlocking.hpp b/alpaka/include/alpaka/queue/QueueGenericSyclBlocking.hpp
index bb743226..44dfb149 100644
--- a/alpaka/include/alpaka/queue/QueueGenericSyclBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueGenericSyclBlocking.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,8 +10,8 @@
 
 namespace alpaka
 {
-    template<typename TDev>
-    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TDev, true>;
+    template<typename TTag>
+    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp b/alpaka/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
index b5dcbe84..22615cae 100644
--- a/alpaka/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,8 +10,8 @@
 
 namespace alpaka
 {
-    template<typename TDev>
-    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TDev, false>;
+    template<typename TTag>
+    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp b/alpaka/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
index 358513e1..37d4bda6 100644
--- a/alpaka/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 namespace alpaka
 {
-    using QueueGpuSyclIntelBlocking = QueueGenericSyclBlocking<DevGpuSyclIntel>;
+    using QueueGpuSyclIntelBlocking = QueueGenericSyclBlocking<TagGpuSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp b/alpaka/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
index f3be15c9..a50299e6 100644
--- a/alpaka/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
+++ b/alpaka/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2024 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
 #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
 namespace alpaka
 {
-    using QueueGpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<DevGpuSyclIntel>;
+    using QueueGpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagGpuSyclIntel>;
 } // namespace alpaka
 
 #endif
diff --git a/alpaka/include/alpaka/queue/Traits.hpp b/alpaka/include/alpaka/queue/Traits.hpp
index d207d92d..71d3ec9e 100644
--- a/alpaka/include/alpaka/queue/Traits.hpp
+++ b/alpaka/include/alpaka/queue/Traits.hpp
@@ -17,7 +17,7 @@ namespace alpaka
 
     //! True if TQueue is a queue, i.e. if it implements the ConceptQueue concept.
     template<typename TQueue>
-    inline constexpr bool isQueue = concepts::ImplementsConcept<ConceptQueue, TQueue>::value;
+    inline constexpr bool isQueue = concepts::ImplementsConcept<ConceptQueue, std::decay_t<TQueue>>::value;
 
     //! The queue traits.
     namespace trait
diff --git a/alpaka/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/alpaka/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index 38fed815..abf57631 100644
--- a/alpaka/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/alpaka/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,259 +23,267 @@
 
 #    include <sycl/sycl.hpp>
 
-namespace alpaka::detail
+namespace alpaka
 {
-    template<typename T, typename = void>
-    inline constexpr auto is_sycl_task = false;
-
-    template<typename T>
-    inline constexpr auto is_sycl_task<T, std::void_t<decltype(T::is_sycl_task)>> = true;
-
-    template<typename T, typename = void>
-    inline constexpr auto is_sycl_kernel = false;
+    template<typename TTag>
+    class DevGenericSycl;
 
-    template<typename T>
-    inline constexpr auto is_sycl_kernel<T, std::void_t<decltype(T::is_sycl_kernel)>> = true;
+    template<typename TTag>
+    class EventGenericSycl;
 
-    class QueueGenericSyclImpl
+    namespace detail
     {
-    public:
-        QueueGenericSyclImpl(sycl::context context, sycl::device device)
-            : m_queue{
-                std::move(context), // This is important. In SYCL a device can belong to multiple contexts.
-                std::move(device),
-                {sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}}
-        {
-        }
+        template<typename T, typename = void>
+        inline constexpr auto is_sycl_task = false;
+
+        template<typename T>
+        inline constexpr auto is_sycl_task<T, std::void_t<decltype(T::is_sycl_task)>> = true;
 
-        // This class will only exist as a pointer. We don't care about copy and move semantics.
-        QueueGenericSyclImpl(QueueGenericSyclImpl const& other) = delete;
-        auto operator=(QueueGenericSyclImpl const& rhs) -> QueueGenericSyclImpl& = delete;
+        template<typename T, typename = void>
+        inline constexpr auto is_sycl_kernel = false;
 
-        QueueGenericSyclImpl(QueueGenericSyclImpl&& other) noexcept = delete;
-        auto operator=(QueueGenericSyclImpl&& rhs) noexcept -> QueueGenericSyclImpl& = delete;
+        template<typename T>
+        inline constexpr auto is_sycl_kernel<T, std::void_t<decltype(T::is_sycl_kernel)>> = true;
 
-        ~QueueGenericSyclImpl()
+        class QueueGenericSyclImpl
         {
-            try
+        public:
+            QueueGenericSyclImpl(sycl::context context, sycl::device device)
+                : m_queue{
+                    std::move(context), // This is important. In SYCL a device can belong to multiple contexts.
+                    std::move(device),
+                    {sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}}
             {
-                m_queue.wait_and_throw();
             }
-            catch(sycl::exception const& err)
+
+            // This class will only exist as a pointer. We don't care about copy and move semantics.
+            QueueGenericSyclImpl(QueueGenericSyclImpl const& other) = delete;
+            auto operator=(QueueGenericSyclImpl const& rhs) -> QueueGenericSyclImpl& = delete;
+
+            QueueGenericSyclImpl(QueueGenericSyclImpl&& other) noexcept = delete;
+            auto operator=(QueueGenericSyclImpl&& rhs) noexcept -> QueueGenericSyclImpl& = delete;
+
+            ~QueueGenericSyclImpl()
             {
-                std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
-                          << err.code() << ')' << std::endl;
+                try
+                {
+                    m_queue.wait_and_throw();
+                }
+                catch(sycl::exception const& err)
+                {
+                    std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
+                              << err.code() << ')' << std::endl;
+                }
+                catch(std::exception const& err)
+                {
+                    std::cerr << "The following runtime error(s) occured while destructing a SYCL queue:" << err.what()
+                              << std::endl;
+                }
             }
-            catch(std::exception const& err)
+
+            // Don't call this without locking first!
+            auto clean_dependencies() -> void
             {
-                std::cerr << "The following runtime error(s) occured while destructing a SYCL queue:" << err.what()
-                          << std::endl;
+                // Clean up completed events
+                auto const start = std::begin(m_dependencies);
+                auto const old_end = std::end(m_dependencies);
+                auto const new_end = std::remove_if(
+                    start,
+                    old_end,
+                    [](sycl::event ev) {
+                        return ev.get_info<sycl::info::event::command_execution_status>()
+                               == sycl::info::event_command_status::complete;
+                    });
+
+                m_dependencies.erase(new_end, old_end);
             }
-        }
-
-        // Don't call this without locking first!
-        auto clean_dependencies() -> void
-        {
-            // Clean up completed events
-            auto const start = std::begin(m_dependencies);
-            auto const old_end = std::end(m_dependencies);
-            auto const new_end = std::remove_if(
-                start,
-                old_end,
-                [](sycl::event ev) {
-                    return ev.get_info<sycl::info::event::command_execution_status>()
-                           == sycl::info::event_command_status::complete;
-                });
-
-            m_dependencies.erase(new_end, old_end);
-        }
-
-        auto register_dependency(sycl::event event) -> void
-        {
-            std::lock_guard<std::shared_mutex> lock{m_mutex};
-
-            clean_dependencies();
-            m_dependencies.push_back(event);
-        }
 
-        auto empty() const -> bool
-        {
-            std::shared_lock<std::shared_mutex> lock{m_mutex};
-            return m_last_event.get_info<sycl::info::event::command_execution_status>()
-                   == sycl::info::event_command_status::complete;
-        }
+            auto register_dependency(sycl::event event) -> void
+            {
+                std::lock_guard<std::shared_mutex> lock{m_mutex};
 
-        auto wait() -> void
-        {
-            // SYCL queues are thread-safe.
-            m_queue.wait_and_throw();
-        }
+                clean_dependencies();
+                m_dependencies.push_back(event);
+            }
 
-        auto get_last_event() const -> sycl::event
-        {
-            std::shared_lock<std::shared_mutex> lock{m_mutex};
-            return m_last_event;
-        }
+            auto empty() const -> bool
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+                return m_last_event.get_info<sycl::info::event::command_execution_status>()
+                       == sycl::info::event_command_status::complete;
+            }
 
-        template<bool TBlocking, typename TTask>
-        auto enqueue(TTask const& task) -> void
-        {
+            auto wait() -> void
             {
-                std::lock_guard<std::shared_mutex> lock{m_mutex};
+                // SYCL queues are thread-safe.
+                m_queue.wait_and_throw();
+            }
 
-                clean_dependencies();
+            auto get_last_event() const -> sycl::event
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+                return m_last_event;
+            }
 
-                // Execute task
-                if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
-                {
-                    m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
-                }
-                else
+            template<bool TBlocking, typename TTask>
+            auto enqueue(TTask const& task) -> void
+            {
                 {
-                    m_last_event = m_queue.submit(
-                        [this, &task](sycl::handler& cgh)
-                        {
-                            if(!m_dependencies.empty())
-                                cgh.depends_on(m_dependencies);
-
-                            if constexpr(is_sycl_kernel<TTask>) // Kernel
-                                task(cgh); // Will call cgh.parallel_for internally
-                            else // Host
-                                cgh.host_task(task);
-                        });
+                    std::lock_guard<std::shared_mutex> lock{m_mutex};
+
+                    clean_dependencies();
+
+                    // Execute task
+                    if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
+                    {
+                        m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
+                    }
+                    else
+                    {
+                        m_last_event = m_queue.submit(
+                            [this, &task](sycl::handler& cgh)
+                            {
+                                if(!m_dependencies.empty())
+                                    cgh.depends_on(m_dependencies);
+
+                                if constexpr(is_sycl_kernel<TTask>) // Kernel
+                                    task(cgh); // Will call cgh.parallel_for internally
+                                else // Host
+                                    cgh.host_task(task);
+                            });
+                    }
+
+                    m_dependencies.clear();
                 }
 
-                m_dependencies.clear();
+                if constexpr(TBlocking)
+                    wait();
             }
 
-            if constexpr(TBlocking)
-                wait();
-        }
-
-        [[nodiscard]] auto getNativeHandle() const noexcept
-        {
-            return m_queue;
-        }
-
-        std::vector<sycl::event> m_dependencies;
-        sycl::event m_last_event;
-        std::shared_mutex mutable m_mutex;
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_queue;
+            }
 
-    private:
-        sycl::queue m_queue;
-    };
+            std::vector<sycl::event> m_dependencies;
+            sycl::event m_last_event;
+            std::shared_mutex mutable m_mutex;
 
-    template<typename TDev, bool TBlocking>
-    class QueueGenericSyclBase
-    {
-    public:
-        QueueGenericSyclBase(TDev const& dev)
-            : m_dev{dev}
-            , m_spQueueImpl{std::make_shared<detail::QueueGenericSyclImpl>(
-                  dev.getNativeHandle().second,
-                  dev.getNativeHandle().first)}
-        {
-            m_dev.m_impl->register_queue(m_spQueueImpl);
-        }
+        private:
+            sycl::queue m_queue;
+        };
 
-        friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
+        template<typename TTag, bool TBlocking>
+        class QueueGenericSyclBase
+            : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericSyclBase<TTag, TBlocking>>
+            , public concepts::Implements<ConceptQueue, QueueGenericSyclBase<TTag, TBlocking>>
+            , public concepts::Implements<ConceptGetDev, QueueGenericSyclBase<TTag, TBlocking>>
         {
-            return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl);
-        }
+        public:
+            QueueGenericSyclBase(DevGenericSycl<TTag> const& dev)
+                : m_dev{dev}
+                , m_spQueueImpl{std::make_shared<detail::QueueGenericSyclImpl>(
+                      dev.getNativeHandle().second,
+                      dev.getNativeHandle().first)}
+            {
+                m_dev.m_impl->register_queue(m_spQueueImpl);
+            }
 
-        friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
-        {
-            return !(lhs == rhs);
-        }
+            friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
+            {
+                return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl);
+            }
 
-        [[nodiscard]] auto getNativeHandle() const noexcept
-        {
-            return m_spQueueImpl->getNativeHandle();
-        }
+            friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
+            {
+                return !(lhs == rhs);
+            }
 
-        TDev m_dev;
-        std::shared_ptr<detail::QueueGenericSyclImpl> m_spQueueImpl;
-    };
-} // namespace alpaka::detail
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_spQueueImpl->getNativeHandle();
+            }
 
-namespace alpaka
-{
-    template<typename TDev>
-    class EventGenericSycl;
-} // namespace alpaka
+            DevGenericSycl<TTag> m_dev;
+            std::shared_ptr<detail::QueueGenericSyclImpl> m_spQueueImpl;
+        };
+    } // namespace detail
 
-namespace alpaka::trait
-{
-    //! The SYCL blocking queue device type trait specialization.
-    template<typename TDev, bool TBlocking>
-    struct DevType<detail::QueueGenericSyclBase<TDev, TBlocking>>
+    namespace trait
     {
-        using type = TDev;
-    };
+        //! The SYCL blocking queue device type trait specialization.
+        template<typename TTag, bool TBlocking>
+        struct DevType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            using type = DevGenericSycl<TTag>;
+        };
 
-    //! The SYCL blocking queue device get trait specialization.
-    template<typename TDev, bool TBlocking>
-    struct GetDev<detail::QueueGenericSyclBase<TDev, TBlocking>>
-    {
-        static auto getDev(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue)
+        //! The SYCL blocking queue device get trait specialization.
+        template<typename TTag, bool TBlocking>
+        struct GetDev<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
         {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            return queue.m_dev;
-        }
-    };
-
-    //! The SYCL blocking queue event type trait specialization.
-    template<typename TDev, bool TBlocking>
-    struct EventType<detail::QueueGenericSyclBase<TDev, TBlocking>>
-    {
-        using type = EventGenericSycl<TDev>;
-    };
+            static auto getDev(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                return queue.m_dev;
+            }
+        };
 
-    //! The SYCL blocking queue enqueue trait specialization.
-    template<typename TDev, bool TBlocking, typename TTask>
-    struct Enqueue<detail::QueueGenericSyclBase<TDev, TBlocking>, TTask>
-    {
-        static auto enqueue(detail::QueueGenericSyclBase<TDev, TBlocking>& queue, TTask const& task) -> void
+        //! The SYCL blocking queue event type trait specialization.
+        template<typename TTag, bool TBlocking>
+        struct EventType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
         {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_spQueueImpl->template enqueue<TBlocking>(task);
-        }
-    };
-
-    //! The SYCL blocking queue test trait specialization.
-    template<typename TDev, bool TBlocking>
-    struct Empty<detail::QueueGenericSyclBase<TDev, TBlocking>>
-    {
-        static auto empty(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> bool
+            using type = EventGenericSycl<TTag>;
+        };
+
+        //! The SYCL blocking queue enqueue trait specialization.
+        template<typename TTag, bool TBlocking, typename TTask>
+        struct Enqueue<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>, TTask>
         {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            return queue.m_spQueueImpl->empty();
-        }
-    };
-
-    //! The SYCL blocking queue thread wait trait specialization.
-    //!
-    //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-    //! tasks (kernels, data copies, ...)
-    template<typename TDev, bool TBlocking>
-    struct CurrentThreadWaitFor<detail::QueueGenericSyclBase<TDev, TBlocking>>
-    {
-        static auto currentThreadWaitFor(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> void
+            static auto enqueue(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>& queue, TTask const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                queue.m_spQueueImpl->template enqueue<TBlocking>(task);
+            }
+        };
+
+        //! The SYCL blocking queue test trait specialization.
+        template<typename TTag, bool TBlocking>
+        struct Empty<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
         {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_spQueueImpl->wait();
-        }
-    };
-
-    //! The SYCL queue native handle trait specialization.
-    template<typename TDev, bool TBlocking>
-    struct NativeHandle<detail::QueueGenericSyclBase<TDev, TBlocking>>
-    {
-        [[nodiscard]] static auto getNativeHandle(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue)
+            static auto empty(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                return queue.m_spQueueImpl->empty();
+            }
+        };
+
+        //! The SYCL blocking queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<typename TTag, bool TBlocking>
+        struct CurrentThreadWaitFor<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
         {
-            return queue.getNativeHandle();
-        }
-    };
-} // namespace alpaka::trait
+            static auto currentThreadWaitFor(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                queue.m_spQueueImpl->wait();
+            }
+        };
 
+        //! The SYCL queue native handle trait specialization.
+        template<typename TTag, bool TBlocking>
+        struct NativeHandle<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            [[nodiscard]] static auto getNativeHandle(
+                alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+            {
+                return queue.getNativeHandle();
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
 #endif
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
index 3e790fab..e80d8a1e 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
@@ -14,7 +14,6 @@ namespace alpaka::rand::engine
      *
      * Relies on `PhiloxStateless` to provide the PRNG and adds state to handling the counting.
      *
-     * @tparam TBackend device-dependent backend, specifies the array types
      * @tparam TParams Philox algorithm parameters \sa PhiloxParams
      * @tparam TImpl engine type implementation (CRTP)
      *
@@ -24,14 +23,16 @@ namespace alpaka::rand::engine
      * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
      * data member are mappable.
      */
-    template<typename TBackend, typename TParams, typename TImpl>
-    class PhiloxBaseCommon
-        : public TBackend
-        , public PhiloxStateless<TBackend, TParams>
+    template<typename TParams, typename TImpl>
+    class PhiloxBaseCommon : public PhiloxStateless<TParams>
     {
     public:
-        using Counter = typename PhiloxStateless<TBackend, TParams>::Counter;
-        using Key = typename PhiloxStateless<TBackend, TParams>::Key;
+        using Counter = typename PhiloxStateless<TParams>::Counter;
+        using Key = typename PhiloxStateless<TParams>::Key;
+
+        /// Distribution container type
+        template<typename TDistributionResultScalar>
+        using ResultContainer = typename alpaka::Vec<alpaka::DimInt<TParams::counterSize>, TDistributionResultScalar>;
 
     protected:
         /** Advance the \a counter to the next state
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxBaseCudaArray.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxBaseCudaArray.hpp
deleted file mode 100644
index c2f8c9c8..00000000
--- a/alpaka/include/alpaka/rand/Philox/PhiloxBaseCudaArray.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/meta/CudaVectorArrayWrapper.hpp"
-
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) || defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-
-namespace alpaka::rand::engine
-{
-    namespace trait
-    {
-        template<typename TScalar>
-        struct PhiloxResultContainerTraits;
-
-        template<>
-        struct PhiloxResultContainerTraits<float>
-        {
-            using type = meta::CudaVectorArrayWrapper<float, 4>;
-        };
-
-        template<>
-        struct PhiloxResultContainerTraits<double>
-        {
-            using type = meta::CudaVectorArrayWrapper<double, 4>;
-        };
-
-        template<>
-        struct PhiloxResultContainerTraits<int>
-        {
-            using type = meta::CudaVectorArrayWrapper<int, 4>;
-        };
-
-        template<>
-        struct PhiloxResultContainerTraits<unsigned>
-        {
-            using type = meta::CudaVectorArrayWrapper<unsigned, 4>;
-        };
-
-        template<typename TScalar>
-        using PhiloxResultContainer = typename PhiloxResultContainerTraits<TScalar>::type;
-    } // namespace trait
-
-    /** Philox backend using array-like interface to CUDA uintN types for the storage of Key and Counter
-     *
-     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
-     */
-    template<typename TParams>
-    class PhiloxBaseCudaArray
-    {
-        static_assert(TParams::counterSize == 4, "GPU Philox implemented only for counters of width == 4");
-
-    public:
-        using Counter
-            = meta::CudaVectorArrayWrapper<unsigned, 4>; ///< Counter type = array-like interface to CUDA uint4
-        using Key = meta::CudaVectorArrayWrapper<unsigned, 2>; ///< Key type = array-like interface to CUDA uint2
-        template<typename TDistributionResultScalar>
-        using ResultContainer = trait::PhiloxResultContainer<TDistributionResultScalar>; ///< Vector template for
-                                                                                         ///< distribution results
-    };
-} // namespace alpaka::rand::engine
-
-#endif
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxBaseStdArray.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxBaseStdArray.hpp
deleted file mode 100644
index 53d3df56..00000000
--- a/alpaka/include/alpaka/rand/Philox/PhiloxBaseStdArray.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <array>
-#include <cstdint>
-
-namespace alpaka::rand::engine
-{
-    /** Philox backend using std::array for Key and Counter storage
-     *
-     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
-     */
-    template<typename TParams>
-    class PhiloxBaseStdArray
-    {
-    public:
-        using Counter = std::array<std::uint32_t, TParams::counterSize>; ///< Counter type = std::array
-        using Key = std::array<std::uint32_t, TParams::counterSize / 2>; ///< Key type = std::array
-        template<typename TScalar>
-        using ResultContainer
-            = std::array<TScalar, TParams::counterSize>; ///< Vector template for distribution results
-    };
-} // namespace alpaka::rand::engine
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxBaseTraits.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxBaseTraits.hpp
deleted file mode 100644
index 8c782fb9..00000000
--- a/alpaka/include/alpaka/rand/Philox/PhiloxBaseTraits.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2022 Jiří Vyskočil, Bernhard Manfred Gruber, Jeffrey Kelling, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseCudaArray.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseStdArray.hpp"
-#include "alpaka/rand/Philox/PhiloxStateless.hpp"
-#include "alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-namespace alpaka
-{
-    template<typename TApi, typename TDim, typename TIdx>
-    class AccGpuUniformCudaHipRt;
-} // namespace alpaka
-#endif
-
-namespace alpaka::rand::engine::trait
-{
-    template<typename TAcc>
-    inline constexpr bool isGPU = false;
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-    template<typename TApi, typename TDim, typename TIdx>
-    inline constexpr bool isGPU<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> = true;
-#endif
-
-    /** Selection of default backend
-     *
-     * Selects the data backend based on the accelerator device type. As of now, different backends operate
-     * on different array types.
-     *
-     * @tparam TAcc the accelerator as defined in alpaka/acc
-     * @tparam TParams Philox algorithm parameters
-     * @tparam TSfinae internal parameter to stop substitution search and provide the default
-     */
-    template<typename TAcc, typename TParams, typename TSfinae = void>
-    struct PhiloxStatelessBaseTraits
-    {
-        // template <typename Acc, typename TParams, typename TImpl>
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-        using Backend = std::conditional_t<isGPU<TAcc>, PhiloxBaseCudaArray<TParams>, PhiloxBaseStdArray<TParams>>;
-#else
-        using Backend = PhiloxBaseStdArray<TParams>;
-#endif
-        using Counter = typename Backend::Counter; ///< Counter array type
-        using Key = typename Backend::Key; ///< Key array type
-        template<typename TDistributionResultScalar>
-        using ResultContainer =
-            typename Backend::template ResultContainer<TDistributionResultScalar>; ///< Distribution
-                                                                                   ///< container type
-        /// Base type to be inherited from by stateless keyed engine
-        using Base = PhiloxStateless<Backend, TParams>;
-    };
-
-    /** Selection of default backend
-     *
-     * Selects the data backend based on the accelerator device type. As of now, different backends operate
-     * on different array types.
-     *
-     * @tparam TAcc the accelerator as defined in alpaka/acc
-     * @tparam TParams Philox algorithm parameters
-     * @tparam TSfinae internal parameter to stop substitution search and provide the default
-     */
-    template<typename TAcc, typename TParams, typename TSfinae = void>
-    struct PhiloxStatelessKeyedBaseTraits : public PhiloxStatelessBaseTraits<TAcc, TParams>
-    {
-        using Backend = typename PhiloxStatelessBaseTraits<TAcc, TParams>::Backend;
-        /// Base type to be inherited from by counting engines
-        using Base = PhiloxStatelessKeyedBase<Backend, TParams>;
-    };
-
-    /** Selection of default backend
-     *
-     * Selects the data backend based on the accelerator device type. As of now, different backends operate
-     * on different array types.
-     *
-     * @tparam TAcc the accelerator as defined in alpaka/acc
-     * @tparam TParams Philox algorithm parameters
-     * @tparam TImpl engine type implementation (CRTP)
-     * @tparam TSfinae internal parameter to stop substitution search and provide the default
-     */
-    template<typename TAcc, typename TParams, typename TImpl, typename TSfinae = void>
-    struct PhiloxBaseTraits : public PhiloxStatelessBaseTraits<TAcc, TParams>
-    {
-        using Backend = typename PhiloxStatelessBaseTraits<TAcc, TParams>::Backend;
-        /// Base type to be inherited from by counting engines
-        using Base = PhiloxBaseCommon<Backend, TParams, TImpl>;
-    };
-} // namespace alpaka::rand::engine::trait
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxConstants.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxConstants.hpp
index f6000d6c..831a1de3 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxConstants.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxConstants.hpp
@@ -31,34 +31,40 @@ namespace alpaka::rand::engine
     class PhiloxConstants
     {
     public:
+        /// First Weyl sequence parameter: the golden ratio
         static constexpr std::uint64_t WEYL_64_0()
         {
-            return 0x9E37'79B9'7F4A'7C15; ///< First Weyl sequence parameter: the golden ratio
+            return 0x9E37'79B9'7F4A'7C15;
         }
 
+        /// Second Weyl sequence parameter: \f$ \sqrt{3}-1 \f$
         static constexpr std::uint64_t WEYL_64_1()
         {
-            return 0xBB67'AE85'84CA'A73B; ///< Second Weyl sequence parameter: \f$ \sqrt{3}-1 \f$
+            return 0xBB67'AE85'84CA'A73B;
         }
 
+        /// 1st Weyl sequence parameter, 32 bits
         static constexpr std::uint32_t WEYL_32_0()
         {
-            return high32Bits(WEYL_64_0()); ///< 1st Weyl sequence parameter, 32 bits
+            return high32Bits(WEYL_64_0());
         }
 
+        /// 2nd Weyl sequence parameter, 32 bits
         static constexpr std::uint32_t WEYL_32_1()
         {
-            return high32Bits(WEYL_64_1()); ///< 2nd Weyl sequence parameter, 32 bits
+            return high32Bits(WEYL_64_1());
         }
 
+        /// First Philox S-box multiplier
         static constexpr std::uint32_t MULTIPLITER_4x32_0()
         {
-            return 0xCD9E'8D57; ///< First Philox S-box multiplier
+            return 0xCD9E'8D57;
         }
 
+        /// Second Philox S-box multiplier
         static constexpr std::uint32_t MULTIPLITER_4x32_1()
         {
-            return 0xD251'1F53; ///< Second Philox S-box multiplier
+            return 0xD251'1F53;
         }
     };
 } // namespace alpaka::rand::engine
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxSingle.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxSingle.hpp
index 4cccae7e..3f7b6ffc 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxSingle.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxSingle.hpp
@@ -5,7 +5,7 @@
 #pragma once
 
 #include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseTraits.hpp"
+#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
 
 #include <utility>
 
@@ -22,10 +22,14 @@ namespace alpaka::rand::engine
         using Counter = TCounter;
         using Key = TKey;
 
-        Counter counter; ///< Counter array
-        Key key; ///< Key array
-        Counter result; ///< Intermediate result array
-        std::uint32_t position; ///< Pointer to the active intermediate result element
+        /// Counter array
+        Counter counter;
+        /// Key array
+        Key key;
+        /// Intermediate result array
+        Counter result;
+        /// Pointer to the active intermediate result element
+        std::uint32_t position;
         // TODO: Box-Muller states
     };
 
@@ -36,21 +40,23 @@ namespace alpaka::rand::engine
      * operator(). Additionally a pointer has to be stored indicating which part of the result array is to be
      * returned next.
      *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
      * @tparam TParams Basic parameters for the Philox algorithm
      */
-    template<typename TAcc, typename TParams>
-    class PhiloxSingle : public trait::PhiloxBaseTraits<TAcc, TParams, PhiloxSingle<TAcc, TParams>>::Base
+    template<typename TParams>
+    class PhiloxSingle : public PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>
     {
     public:
-        /// Specialization for different TAcc backends
-        using Traits = typename trait::PhiloxBaseTraits<TAcc, TParams, PhiloxSingle<TAcc, TParams>>;
+        using Base = PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>;
 
-        using Counter = typename Traits::Counter; ///< Backend-dependent Counter type
-        using Key = typename Traits::Key; ///< Backend-dependent Key type
-        using State = PhiloxStateSingle<Counter, Key>; ///< Backend-dependent State type
+        /// Counter type
+        using Counter = typename Base::Counter;
+        /// Key type
+        using Key = typename Base::Key;
+        /// State type
+        using State = PhiloxStateSingle<Counter, Key>;
 
-        State state; ///< Internal engine state
+        /// Internal engine state
+        State state;
 
     protected:
         /** Advance internal counter to the next value
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxStateless.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxStateless.hpp
index 094b9d3d..3011d446 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxStateless.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxStateless.hpp
@@ -7,6 +7,7 @@
 #include "alpaka/core/Unroll.hpp"
 #include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
 #include "alpaka/rand/Philox/PhiloxConstants.hpp"
+#include "alpaka/vec/Vec.hpp"
 
 #include <utility>
 
@@ -28,13 +29,12 @@ namespace alpaka::rand::engine
 
     /** Class basic Philox family counter-based PRNG
      *
-     * Checks the validity of passed-in parameters and calls the \a TBackend methods to perform N rounds of the
+     * Checks the validity of passed-in parameters and calls the backend methods to perform N rounds of the
      * Philox shuffle.
      *
-     * @tparam TBackend device-dependent backend, specifies the array types
      * @tparam TParams Philox algorithm parameters \sa PhiloxParams
      */
-    template<typename TBackend, typename TParams>
+    template<typename TParams>
     class PhiloxStateless : public PhiloxConstants<TParams>
     {
         static constexpr unsigned numRounds()
@@ -60,8 +60,8 @@ namespace alpaka::rand::engine
         static_assert(numberWidth() == 32, "Philox implemented only for 32 bit numbers.");
 
     public:
-        using Counter = typename TBackend::Counter;
-        using Key = typename TBackend::Key;
+        using Counter = alpaka::Vec<alpaka::DimInt<TParams::counterSize>, std::uint32_t>;
+        using Key = alpaka::Vec<alpaka::DimInt<TParams::counterSize / 2>, std::uint32_t>;
         using Constants = PhiloxConstants<TParams>;
 
     protected:
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
index c997ec0e..bb6795b7 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
@@ -10,20 +10,19 @@ namespace alpaka::rand::engine
 {
     /** Common class for Philox family engines
      *
-     * Checks the validity of passed-in parameters and calls the \a TBackend methods to perform N rounds of the
+     * Checks the validity of passed-in parameters and calls the backend methods to perform N rounds of the
      * Philox shuffle.
      *
-     * @tparam TBackend device-dependent backend, specifies the array types
      * @tparam TParams Philox algorithm parameters \sa PhiloxParams
      */
-    template<typename TBackend, typename TParams>
-    struct PhiloxStatelessKeyedBase : public PhiloxStateless<TBackend, TParams>
+    template<typename TParams>
+    struct PhiloxStatelessKeyedBase : public PhiloxStateless<TParams>
     {
     public:
-        using Counter = typename PhiloxStateless<TBackend, TParams>::Counter;
-        using Key = typename PhiloxStateless<TBackend, TParams>::Key;
+        using Counter = typename PhiloxStateless<TParams>::Counter;
+        using Key = typename PhiloxStateless<TParams>::Key;
 
-        const Key m_key;
+        Key const m_key;
 
         PhiloxStatelessKeyedBase(Key&& key) : m_key(std::move(key))
         {
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxStatelessVector.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxStatelessVector.hpp
deleted file mode 100644
index 49a7fa1b..00000000
--- a/alpaka/include/alpaka/rand/Philox/PhiloxStatelessVector.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/PhiloxBaseTraits.hpp"
-
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Philox-stateless engine generating a vector of numbers
-     *
-     * This engine's operator() will return a vector of numbers corresponding to the full size of its counter.
-     * This is a convenience vs. memory size tradeoff since the user has to deal with the output array
-     * themselves, but the internal state comprises only of a single counter and a key.
-     *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
-     * @tparam TParams Basic parameters for the Philox algorithm
-     */
-    template<typename TAcc, typename TParams>
-    class PhiloxStatelessVector : public trait::PhiloxStatelessBaseTraits<TAcc, TParams>::Base
-    {
-    };
-} // namespace alpaka::rand::engine
diff --git a/alpaka/include/alpaka/rand/Philox/PhiloxVector.hpp b/alpaka/include/alpaka/rand/Philox/PhiloxVector.hpp
index 648399ca..64c89b44 100644
--- a/alpaka/include/alpaka/rand/Philox/PhiloxVector.hpp
+++ b/alpaka/include/alpaka/rand/Philox/PhiloxVector.hpp
@@ -5,7 +5,7 @@
 #pragma once
 
 #include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseTraits.hpp"
+#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
 
 #include <utility>
 
@@ -22,8 +22,10 @@ namespace alpaka::rand::engine
         using Counter = TCounter;
         using Key = TKey;
 
-        Counter counter; ///< Counter array
-        Key key; ///< Key array
+        /// Counter array
+        Counter counter;
+        /// Key array
+        Key key;
     };
 
     /** Philox engine generating a vector of numbers
@@ -32,21 +34,23 @@ namespace alpaka::rand::engine
      * This is a convenience vs. memory size tradeoff since the user has to deal with the output array
      * themselves, but the internal state comprises only of a single counter and a key.
      *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
      * @tparam TParams Basic parameters for the Philox algorithm
      */
-    template<typename TAcc, typename TParams>
-    class PhiloxVector : public trait::PhiloxBaseTraits<TAcc, TParams, PhiloxVector<TAcc, TParams>>::Base
+    template<typename TParams>
+    class PhiloxVector : public PhiloxBaseCommon<TParams, PhiloxVector<TParams>>
     {
     public:
-        /// Specialization for different TAcc backends
-        using Traits = trait::PhiloxBaseTraits<TAcc, TParams, PhiloxVector<TAcc, TParams>>;
+        using Base = PhiloxBaseCommon<TParams, PhiloxVector<TParams>>;
+
+        /// Counter type
+        using Counter = typename Base::Counter;
+        /// Key type
+        using Key = typename Base::Key;
+        /// State type
+        using State = PhiloxStateVector<Counter, Key>;
 
-        using Counter = typename Traits::Counter; ///< Backend-dependent Counter type
-        using Key = typename Traits::Key; ///< Backend-dependent Key type
-        using State = PhiloxStateVector<Counter, Key>; ///< Backend-dependent State type
         template<typename TDistributionResultScalar>
-        using ResultContainer = typename Traits::template ResultContainer<TDistributionResultScalar>;
+        using ResultContainer = typename Base::template ResultContainer<TDistributionResultScalar>;
 
         State state;
 
diff --git a/alpaka/include/alpaka/rand/RandDefault.hpp b/alpaka/include/alpaka/rand/RandDefault.hpp
index 6cb17018..bbe763c6 100644
--- a/alpaka/include/alpaka/rand/RandDefault.hpp
+++ b/alpaka/include/alpaka/rand/RandDefault.hpp
@@ -79,7 +79,7 @@ namespace alpaka::rand
             ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
             {
                 constexpr BitsT limit = static_cast<BitsT>(1) << std::numeric_limits<T>::digits;
-                const BitsT b = UniformUint<BitsT>()(engine);
+                BitsT const b = UniformUint<BitsT>()(engine);
                 auto const ret = static_cast<T>(b & (limit - 1)) / limit;
                 return ret;
             }
@@ -147,7 +147,7 @@ namespace alpaka::rand
                     } while(u1 <= std::numeric_limits<T>::epsilon());
 
                     // compute z0 and z1
-                    const T mag = sigma * math::sqrt(*m_acc, static_cast<T>(-2.) * math::log(*m_acc, u1));
+                    T const mag = sigma * math::sqrt(*m_acc, static_cast<T>(-2.) * math::log(*m_acc, u1));
                     constexpr T twoPi = static_cast<T>(2. * math::constants::pi);
                     // getting two normal number out of this, store one for later
                     m_cache = mag * static_cast<T>(math::cos(*m_acc, twoPi * u2)) + mu;
@@ -155,7 +155,7 @@ namespace alpaka::rand
                     return mag * static_cast<T>(math::sin(*m_acc, twoPi * u2)) + mu;
                 }
 
-                const T ret = m_cache;
+                T const ret = m_cache;
                 m_cache = std::numeric_limits<T>::quiet_NaN();
                 return ret;
             }
@@ -207,7 +207,7 @@ namespace alpaka::rand
                 TAcc const& /* acc */,
                 std::uint32_t const& seed,
                 std::uint32_t const& subsequence,
-                std::uint32_t const& offset) -> Philox4x32x10<TAcc>
+                std::uint32_t const& offset) -> Philox4x32x10
             {
                 return {seed, subsequence, offset};
             }
diff --git a/alpaka/include/alpaka/rand/RandPhilox.hpp b/alpaka/include/alpaka/rand/RandPhilox.hpp
index 72cf99b2..d11cacb6 100644
--- a/alpaka/include/alpaka/rand/RandPhilox.hpp
+++ b/alpaka/include/alpaka/rand/RandPhilox.hpp
@@ -27,15 +27,14 @@ namespace alpaka::rand
      * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
      * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
      * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
      */
-    template<typename TAcc>
-    class Philox4x32x10 : public concepts::Implements<ConceptRand, Philox4x32x10<TAcc>>
+    class Philox4x32x10 : public concepts::Implements<ConceptRand, Philox4x32x10>
     {
     public:
-        using EngineParams = engine::PhiloxParams<4, 32, 10>; ///< Philox algorithm: 10 rounds, 4 numbers of size 32.
-        using EngineVariant = engine::PhiloxSingle<TAcc, EngineParams>; ///< Engine outputs a single number
+        /// Philox algorithm: 10 rounds, 4 numbers of size 32.
+        using EngineParams = engine::PhiloxParams<4, 32, 10>;
+        /// Engine outputs a single number
+        using EngineVariant = engine::PhiloxSingle<EngineParams>;
 
         /** Initialize a new Philox engine
          *
@@ -84,15 +83,12 @@ namespace alpaka::rand
      * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
      * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
      * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
      */
-    template<typename TAcc>
-    class Philox4x32x10Vector : public concepts::Implements<ConceptRand, Philox4x32x10Vector<TAcc>>
+    class Philox4x32x10Vector : public concepts::Implements<ConceptRand, Philox4x32x10Vector>
     {
     public:
         using EngineParams = engine::PhiloxParams<4, 32, 10>;
-        using EngineVariant = engine::PhiloxVector<TAcc, EngineParams>;
+        using EngineVariant = engine::PhiloxVector<EngineParams>;
 
         /** Initialize a new Philox engine
          *
@@ -178,7 +174,7 @@ namespace alpaka::rand
             if constexpr(meta::IsArrayOrVector<TResult>::value)
             {
                 auto result = engine();
-                T scale = static_cast<T>(1) / engine.max() * _range;
+                T scale = static_cast<T>(1) / static_cast<T>(engine.max()) * _range;
                 TResult ret{
                     static_cast<T>(result[0]) * scale + _min,
                     static_cast<T>(result[1]) * scale + _min,
@@ -189,15 +185,17 @@ namespace alpaka::rand
             else
             {
                 // Since it's possible to get a host-only engine here, the call has to go through proxy
-                return static_cast<T>(EngineCallHostAccProxy<TEngine>{}(engine)) / engine.max() * _range + _min;
+                return static_cast<T>(EngineCallHostAccProxy<TEngine>{}(engine)) / static_cast<T>(engine.max())
+                           * _range
+                       + _min;
             }
 
             ALPAKA_UNREACHABLE(TResult{});
         }
 
     private:
-        const T _min;
-        const T _max;
-        const T _range;
+        T const _min;
+        T const _max;
+        T const _range;
     };
 } // namespace alpaka::rand
diff --git a/alpaka/include/alpaka/rand/RandPhiloxStateless.hpp b/alpaka/include/alpaka/rand/RandPhiloxStateless.hpp
index b7480f6f..b2530d13 100644
--- a/alpaka/include/alpaka/rand/RandPhiloxStateless.hpp
+++ b/alpaka/include/alpaka/rand/RandPhiloxStateless.hpp
@@ -5,7 +5,6 @@
 #pragma once
 
 #include "alpaka/rand/Philox/PhiloxStateless.hpp"
-#include "alpaka/rand/Philox/PhiloxStatelessVector.hpp"
 #include "alpaka/rand/Traits.hpp"
 
 namespace alpaka::rand
@@ -20,13 +19,10 @@ namespace alpaka::rand
      * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
      * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
      * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     *
-     * @tparam TAcc Accelerator type as defined in alpaka/acc
      */
-    template<typename TAcc>
     class PhiloxStateless4x32x10Vector
-        : public alpaka::rand::engine::PhiloxStatelessVector<TAcc, engine::PhiloxParams<4, 32, 10>>
-        , public concepts::Implements<ConceptRand, PhiloxStateless4x32x10Vector<TAcc>>
+        : public alpaka::rand::engine::PhiloxStateless<engine::PhiloxParams<4, 32, 10>>
+        , public concepts::Implements<ConceptRand, PhiloxStateless4x32x10Vector>
     {
     public:
         using EngineParams = engine::PhiloxParams<4, 32, 10>;
diff --git a/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp b/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp
index 97090be3..63ffea90 100644
--- a/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp
+++ b/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp
@@ -231,7 +231,7 @@ namespace alpaka::rand
         template<typename TApi, typename T>
         struct CreateNormalReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
         {
-            __device__ static auto createNormalReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
+            static __device__ auto createNormalReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
                 -> uniform_cuda_hip::NormalReal<T>
             {
                 return {};
@@ -242,7 +242,7 @@ namespace alpaka::rand
         template<typename TApi, typename T>
         struct CreateUniformReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
         {
-            __device__ static auto createUniformReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
+            static __device__ auto createUniformReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
                 -> uniform_cuda_hip::UniformReal<T>
             {
                 return {};
@@ -253,7 +253,7 @@ namespace alpaka::rand
         template<typename TApi, typename T>
         struct CreateUniformUint<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_integral_v<T>>>
         {
-            __device__ static auto createUniformUint(RandUniformCudaHipRand<TApi> const& /*rand*/)
+            static __device__ auto createUniformUint(RandUniformCudaHipRand<TApi> const& /*rand*/)
                 -> uniform_cuda_hip::UniformUint<T>
             {
                 return {};
@@ -267,7 +267,7 @@ namespace alpaka::rand
         template<typename TApi>
         struct CreateDefault<RandUniformCudaHipRand<TApi>>
         {
-            __device__ static auto createDefault(
+            static __device__ auto createDefault(
                 RandUniformCudaHipRand<TApi> const& /*rand*/,
                 std::uint32_t const& seed = 0,
                 std::uint32_t const& subsequence = 0,
diff --git a/alpaka/include/alpaka/test/KernelExecutionFixture.hpp b/alpaka/include/alpaka/test/KernelExecutionFixture.hpp
index 6d2cf31f..0e593444 100644
--- a/alpaka/include/alpaka/test/KernelExecutionFixture.hpp
+++ b/alpaka/include/alpaka/test/KernelExecutionFixture.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2024 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -34,28 +34,52 @@ namespace alpaka::test
         using Queue = test::DefaultQueue<Device>;
         using WorkDiv = WorkDivMembers<Dim, Idx>;
 
-        KernelExecutionFixture(WorkDiv workDiv) : m_workDiv{std::move(workDiv)}
+        KernelExecutionFixture(WorkDiv workDiv) : m_queue{m_device}, m_workDiv{std::move(workDiv)}
         {
         }
 
         template<typename TExtent>
-        KernelExecutionFixture(TExtent const& extent)
-            : m_workDiv{getValidWorkDiv<Acc>(
-                m_device,
-                extent,
-                Vec<Dim, Idx>::ones(),
-                false,
-                GridBlockExtentSubDivRestrictions::Unrestricted)}
+        KernelExecutionFixture(TExtent const& extent) : m_queue{m_device}
+                                                      , m_extent{extent}
+        {
+        }
+
+        KernelExecutionFixture(Queue queue, WorkDiv workDiv)
+            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not be used
+            , m_device{alpaka::getDev(queue)}
+            , m_queue{std::move(queue)}
+            , m_workDiv{std::move(workDiv)}
+        {
+        }
+
+        template<typename TExtent>
+        KernelExecutionFixture(Queue queue, TExtent const& extent)
+            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not be used
+            , m_device{alpaka::getDev(queue)}
+            , m_queue{std::move(queue)}
+            , m_extent{extent}
         {
         }
 
         template<typename TKernelFnObj, typename... TArgs>
-        auto operator()(TKernelFnObj const& kernelFnObj, TArgs&&... args) -> bool
+        auto operator()(TKernelFnObj kernelFnObj, TArgs&&... args) -> bool
         {
             // Allocate the result value
             auto bufAccResult = allocBuf<bool, Idx>(m_device, static_cast<Idx>(1u));
             memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true));
 
+
+            alpaka::KernelCfg<Acc> const kernelCfg = {m_extent, Vec<Dim, Idx>::ones()};
+
+            // set workdiv if it is not before
+            if(m_workDiv == WorkDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)})
+                m_workDiv = alpaka::getValidWorkDiv(
+                    kernelCfg,
+                    m_device,
+                    kernelFnObj,
+                    getPtrNative(bufAccResult),
+                    std::forward<TArgs>(args)...);
+
             exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward<TArgs>(args)...);
 
             // Copy the result value to the host
@@ -73,7 +97,9 @@ namespace alpaka::test
         DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
         Platform m_platform{};
         Device m_device{getDevByIdx(m_platform, 0)};
-        Queue m_queue{m_device};
-        WorkDiv m_workDiv;
+        Queue m_queue;
+        WorkDiv m_workDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)};
+        Vec<Dim, Idx> m_extent;
     };
+
 } // namespace alpaka::test
diff --git a/alpaka/include/alpaka/test/acc/TestAccs.hpp b/alpaka/include/alpaka/test/acc/TestAccs.hpp
index c0751aa1..2370fa42 100644
--- a/alpaka/include/alpaka/test/acc/TestAccs.hpp
+++ b/alpaka/include/alpaka/test/acc/TestAccs.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan,
+ * Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -79,21 +80,21 @@ namespace alpaka::test
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_CPU)
         template<typename TDim, typename TIdx>
-        using AccCpuSyclIfAvailableElseInt = alpaka::AccCpuSycl<TDim, TIdx>;
+        using AccCpuSyclIfAvailableElseInt = AccCpuSycl<TDim, TIdx>;
 #else
         template<typename TDim, typename TIdx>
         using AccCpuSyclIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_FPGA)
         template<typename TDim, typename TIdx>
-        using AccFpgaSyclIntelIfAvailableElseInt = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
+        using AccFpgaSyclIntelIfAvailableElseInt = AccFpgaSyclIntel<TDim, TIdx>;
 #else
         template<typename TDim, typename TIdx>
         using AccFpgaSyclIntelIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_GPU)
         template<typename TDim, typename TIdx>
-        using AccGpuSyclIntelIfAvailableElseInt = alpaka::AccGpuSyclIntel<TDim, TIdx>;
+        using AccGpuSyclIntelIfAvailableElseInt = AccGpuSyclIntel<TDim, TIdx>;
 #else
         template<typename TDim, typename TIdx>
         using AccGpuSyclIntelIfAvailableElseInt = int;
diff --git a/alpaka/include/alpaka/test/event/EventHostManualTrigger.hpp b/alpaka/include/alpaka/test/event/EventHostManualTrigger.hpp
index 4ce36390..653dbbb6 100644
--- a/alpaka/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ b/alpaka/include/alpaka/test/event/EventHostManualTrigger.hpp
@@ -1,5 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
- *                Bernhard Manfred Gruber
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
+ *                Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -712,11 +712,11 @@ namespace alpaka
 {
     namespace test
     {
-        template<typename TPlatform>
+        template<typename TTag>
         class EventHostManualTriggerSycl
         {
         public:
-            EventHostManualTriggerSycl(DevGenericSycl<TPlatform> const&)
+            EventHostManualTriggerSycl(DevGenericSycl<TTag> const&)
             {
             }
 
@@ -727,16 +727,16 @@ namespace alpaka
 
         namespace trait
         {
-            template<typename TPlatform>
-            struct EventHostManualTriggerType<DevGenericSycl<TPlatform>>
+            template<typename TTag>
+            struct EventHostManualTriggerType<DevGenericSycl<TTag>>
             {
-                using type = alpaka::test::EventHostManualTriggerSycl<TPlatform>;
+                using type = alpaka::test::EventHostManualTriggerSycl<TTag>;
             };
 
-            template<typename TPlatform>
-            struct IsEventHostManualTriggerSupported<DevGenericSycl<TPlatform>>
+            template<typename TTag>
+            struct IsEventHostManualTriggerSupported<DevGenericSycl<TTag>>
             {
-                ALPAKA_FN_HOST static auto isSupported(DevGenericSycl<TPlatform> const&) -> bool
+                ALPAKA_FN_HOST static auto isSupported(DevGenericSycl<TTag> const&) -> bool
                 {
                     return false;
                 }
@@ -746,35 +746,30 @@ namespace alpaka
 
     namespace trait
     {
-        template<typename TPlatform>
-        struct Enqueue<
-            QueueGenericSyclBlocking<DevGenericSycl<TPlatform>>,
-            test::EventHostManualTriggerSycl<TPlatform>>
+        template<typename TTag>
+        struct Enqueue<QueueGenericSyclBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
         {
             ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericSyclBlocking<DevGenericSycl<TPlatform>>& /* queue */,
-                test::EventHostManualTriggerSycl<TPlatform>& /* event */) -> void
+                QueueGenericSyclBlocking<TTag>& /* queue */,
+                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
             {
             }
         };
 
-        template<typename TPlatform>
-        struct Enqueue<
-            QueueGenericSyclNonBlocking<DevGenericSycl<TPlatform>>,
-            test::EventHostManualTriggerSycl<TPlatform>>
+        template<typename TTag>
+        struct Enqueue<QueueGenericSyclNonBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
         {
             ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericSyclNonBlocking<DevGenericSycl<TPlatform>>& /* queue */,
-                test::EventHostManualTriggerSycl<TPlatform>& /* event */) -> void
+                QueueGenericSyclNonBlocking<TTag>& /* queue */,
+                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
             {
             }
         };
 
-        template<typename TPlatform>
-        struct IsComplete<test::EventHostManualTriggerSycl<TPlatform>>
+        template<typename TTag>
+        struct IsComplete<test::EventHostManualTriggerSycl<TTag>>
         {
-            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerSycl<TPlatform> const& /* event */)
-                -> bool
+            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerSycl<TTag> const& /* event */) -> bool
             {
                 return true;
             }
diff --git a/alpaka/include/alpaka/test/queue/Queue.hpp b/alpaka/include/alpaka/test/queue/Queue.hpp
index 22432fc7..0518e6d4 100644
--- a/alpaka/include/alpaka/test/queue/Queue.hpp
+++ b/alpaka/include/alpaka/test/queue/Queue.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci,
+ * Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -38,14 +39,20 @@ namespace alpaka::test
 #    endif
         };
 #endif
-    } // namespace trait
 
-    //! The queue type that should be used for the given device.
-    template<typename TDev>
-    using DefaultQueue = typename trait::DefaultQueueType<TDev>::type;
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+        //! The default queue type trait specialization for the SYCL device.
+        template<typename TTag>
+        struct DefaultQueueType<DevGenericSycl<TTag>>
+        {
+#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            using type = QueueGenericSyclBlocking<TTag>;
+#    else
+            using type = QueueGenericSyclNonBlocking<TTag>;
+#    endif
+        };
+#endif
 
-    namespace trait
-    {
         //! The blocking queue trait.
         template<typename TQueue, typename TSfinae = void>
         struct IsBlockingQueue;
@@ -82,81 +89,24 @@ namespace alpaka::test
 #endif
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
-#    ifdef ALPAKA_SYCL_ONEAPI_CPU
-        //! The default queue type trait specialization for the Intel CPU device.
-        template<>
-        struct DefaultQueueType<alpaka::DevCpuSycl>
-        {
-#        if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = alpaka::QueueCpuSyclBlocking;
-#        else
-            using type = alpaka::QueueCpuSyclNonBlocking;
-#        endif
-        };
-
-        template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclBlocking>
+        template<typename TTag>
+        struct IsBlockingQueue<QueueGenericSyclBlocking<TTag>>
         {
             static constexpr auto value = true;
         };
 
-        template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclNonBlocking>
+        template<typename TTag>
+        struct IsBlockingQueue<QueueGenericSyclNonBlocking<TTag>>
         {
             static constexpr auto value = false;
         };
-#    endif
-#    ifdef ALPAKA_SYCL_ONEAPI_FPGA
-        //! The default queue type trait specialization for the Intel SYCL device.
-        template<>
-        struct DefaultQueueType<alpaka::DevFpgaSyclIntel>
-        {
-#        if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = alpaka::QueueFpgaSyclIntelBlocking;
-#        else
-            using type = alpaka::QueueFpgaSyclIntelNonBlocking;
-#        endif
-        };
-
-        template<>
-        struct IsBlockingQueue<alpaka::QueueFpgaSyclIntelBlocking>
-        {
-            static constexpr auto value = true;
-        };
-
-        template<>
-        struct IsBlockingQueue<alpaka::QueueFpgaSyclIntelNonBlocking>
-        {
-            static constexpr auto value = false;
-        };
-#    endif
-#    ifdef ALPAKA_SYCL_ONEAPI_GPU
-        //! The default queue type trait specialization for the Intel CPU device.
-        template<>
-        struct DefaultQueueType<alpaka::DevGpuSyclIntel>
-        {
-#        if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = alpaka::QueueGpuSyclIntelBlocking;
-#        else
-            using type = alpaka::QueueGpuSyclIntelNonBlocking;
-#        endif
-        };
-
-        template<>
-        struct IsBlockingQueue<alpaka::QueueGpuSyclIntelBlocking>
-        {
-            static constexpr auto value = true;
-        };
-
-        template<>
-        struct IsBlockingQueue<alpaka::QueueGpuSyclIntelNonBlocking>
-        {
-            static constexpr auto value = false;
-        };
-#    endif
 #endif
     } // namespace trait
 
+    //! The queue type that should be used for the given device.
+    template<typename TDev>
+    using DefaultQueue = typename trait::DefaultQueueType<TDev>::type;
+
     //! The queue type that should be used for the given accelerator.
     template<typename TQueue>
     using IsBlockingQueue = trait::IsBlockingQueue<TQueue>;
diff --git a/alpaka/include/alpaka/vec/Vec.hpp b/alpaka/include/alpaka/vec/Vec.hpp
index 616bed6b..d327f60f 100644
--- a/alpaka/include/alpaka/vec/Vec.hpp
+++ b/alpaka/include/alpaka/vec/Vec.hpp
@@ -41,6 +41,7 @@ namespace alpaka
 
         using Dim = TDim;
         using Val = TVal;
+        using value_type = Val; //!< STL-like value_type.
 
     private:
         //! A sequence of integers from 0 to dim-1.
@@ -87,18 +88,22 @@ namespace alpaka
         ALPAKA_FN_HOST_ACC constexpr explicit Vec(
             F&& generator,
             std::void_t<decltype(generator(std::integral_constant<std::size_t, 0>{}))>* ignore = nullptr)
+            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
+        {
+            static_cast<void>(ignore);
+        }
 #else
         template<typename F, std::enable_if_t<std::is_invocable_v<F, std::integral_constant<std::size_t, 0>>, int> = 0>
         ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator)
-#endif
-            : Vec(std::forward<F>(generator), std::make_integer_sequence<TVal, TDim::value>{})
+            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
         {
         }
+#endif
 
     private:
-        template<typename F, TVal... Is>
-        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator, std::integer_sequence<TVal, Is...>)
-            : m_data{generator(std::integral_constant<TVal, Is>{})...}
+        template<typename F, std::size_t... Is>
+        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator, std::index_sequence<Is...>)
+            : m_data{generator(std::integral_constant<std::size_t, Is>{})...}
         {
         }
 
@@ -587,7 +592,8 @@ namespace alpaka
     };
 
     template<typename TFirstIndex, typename... TRestIndices>
-    Vec(TFirstIndex&&, TRestIndices&&...) -> Vec<DimInt<1 + sizeof...(TRestIndices)>, std::decay_t<TFirstIndex>>;
+    ALPAKA_FN_HOST_ACC Vec(TFirstIndex&&, TRestIndices&&...)
+        -> Vec<DimInt<1 + sizeof...(TRestIndices)>, std::decay_t<TFirstIndex>>;
 
     template<typename T>
     inline constexpr bool isVec = false;
diff --git a/alpaka/include/alpaka/version.hpp b/alpaka/include/alpaka/version.hpp
index d35718a2..9ea2db77 100644
--- a/alpaka/include/alpaka/version.hpp
+++ b/alpaka/include/alpaka/version.hpp
@@ -7,7 +7,7 @@
 #include <boost/predef/version_number.h>
 
 #define ALPAKA_VERSION_MAJOR 1
-#define ALPAKA_VERSION_MINOR 1
+#define ALPAKA_VERSION_MINOR 2
 #define ALPAKA_VERSION_PATCH 0
 
 //! The alpaka library version number
diff --git a/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp b/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
index 99f136f4..c15319cf 100644
--- a/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
+++ b/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
@@ -10,6 +10,8 @@
 #include "alpaka/core/Utility.hpp"
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/extent/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
 #include "alpaka/vec/Vec.hpp"
 #include "alpaka/workdiv/WorkDivMembers.hpp"
 
@@ -20,6 +22,11 @@
 #include <set>
 #include <type_traits>
 
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wswitch-default"
+#endif
+
 //! The alpaka library.
 namespace alpaka
 {
@@ -113,23 +120,22 @@ namespace alpaka
     //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
     //! 3. The requirement of the block extent.
     //!
-    //! \param gridElemExtent
-    //!     The full extent of elements in the grid.
-    //! \param threadElemExtent
-    //!     the number of elements computed per thread.
-    //! \param accDevProps
-    //!     The maxima for the work division.
-    //! \param blockThreadMustDivideGridThreadExtent
-    //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
+    //! \param gridElemExtent The full extent of elements in the grid.
+    //! \param threadElemExtent the number of elements computed per thread.
+    //! \param accDevProps The maxima for the work division.
+    //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
+    //! used, device hard limits are used.
+    //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
+    //! corresponding block thread extent.
     //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
     //!     thread extent will be one in this dimension.
-    //! \param gridBlockExtentSubDivRestrictions
-    //!     The grid block extent subdivision restrictions.
+    //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
     template<typename TDim, typename TIdx>
     ALPAKA_FN_HOST auto subDivideGridElems(
         Vec<TDim, TIdx> const& gridElemExtent,
         Vec<TDim, TIdx> const& threadElemExtent,
         AccDevProps<TDim, TIdx> const& accDevProps,
+        TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
         bool blockThreadMustDivideGridThreadExtent = true,
         GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
         = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers<TDim, TIdx>
@@ -148,7 +154,7 @@ namespace alpaka
 
         // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
         // optimized.
-        auto const clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
+        auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
         auto const gridThreadExtent = [&]
         {
             Vec r;
@@ -168,11 +174,27 @@ namespace alpaka
         // For equal block thread extent, restrict it to its minimum component.
         // For example (512, 256, 1024) will get (256, 256, 256).
         if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-            blockThreadExtent = Vec::all(blockThreadExtent.min());
+            blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
+
+        // Choose kernelBlockThreadCountMax if it is not zero. It is less than the accelerator properties.
+        TIdx const& blockThreadCountMax
+            = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
+
+        // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
+        // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
+        // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
+        // enough.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
+        }
 
         // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
-        auto const& blockThreadCountMax = accDevProps.m_blockThreadCountMax;
-        if(blockThreadExtent.prod() > blockThreadCountMax)
+        if(blockThreadCountMax == 1)
+        {
+            blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
+        }
+        else if(blockThreadExtent.prod() > blockThreadCountMax)
         {
             switch(gridBlockExtentSubDivRestrictions)
             {
@@ -205,6 +227,7 @@ namespace alpaka
             }
         }
 
+
         // Make the block thread extent divide the grid thread extent.
         if(blockThreadMustDivideGridThreadExtent)
         {
@@ -242,13 +265,15 @@ namespace alpaka
                 [[fallthrough]];
             case GridBlockExtentSubDivRestrictions::Unrestricted:
                 for(DimLoopInd i(0u); i < TDim::value; ++i)
+                {
                     blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
+                }
                 break;
             }
         }
 
         // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
-        auto const gridBlockExtent = [&]
+        auto gridBlockExtent = [&]
         {
             Vec r;
             for(DimLoopInd i = 0; i < TDim::value; ++i)
@@ -256,84 +281,135 @@ namespace alpaka
             return r;
         }();
 
+
+        // Store the maxima allowed for extents of grid, blocks and threads.
+        auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
+        auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
+        auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
+
+        // Check that the extents for all dimensions are correct.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            // Check that the maximum extents are greater or equal 1.
+            if(gridBlockExtentMax[i] < gridBlockExtent[i])
+            {
+                gridBlockExtent[i] = gridBlockExtentMax[i];
+            }
+            if(blockThreadExtentMax[i] < blockThreadExtent[i])
+            {
+                blockThreadExtent[i] = blockThreadExtentMax[i];
+            }
+            if(threadElemExtentMax[i] < threadElemExtent[i])
+            {
+                clippedThreadElemExtent[i] = threadElemExtentMax[i];
+            }
+        }
+
         return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
     }
 
-    //! \tparam TAcc The accelerator for which this work division has to be valid.
+    //! Kernel start configuration to determine a valid work division
+    //!
     //! \tparam TGridElemExtent The type of the grid element extent.
     //! \tparam TThreadElemExtent The type of the thread element extent.
-    //! \tparam TDev The type of the device.
-    //! \param dev
-    //!     The device the work division should be valid for.
-    //! \param gridElemExtent
-    //!     The full extent of elements in the grid.
-    //! \param threadElemExtents
-    //!     the number of elements computed per thread.
-    //! \param blockThreadMustDivideGridThreadExtent
-    //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
-    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
-    //!     thread extent will be one in this dimension.
-    //! \param gridBlockExtentSubDivRestrictions
-    //!     The grid block extent subdivision restrictions.
-    //! \return The work division.
     template<
         typename TAcc,
-        typename TDev,
-        typename TGridElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>,
-        typename TThreadElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>>
-    ALPAKA_FN_HOST auto getValidWorkDiv(
-        [[maybe_unused]] TDev const& dev,
-        [[maybe_unused]] TGridElemExtent const& gridElemExtent = Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
-        [[maybe_unused]] TThreadElemExtent const& threadElemExtents = Vec<Dim<TAcc>, Idx<TAcc>>::ones(),
-        [[maybe_unused]] bool blockThreadMustDivideGridThreadExtent = true,
-        [[maybe_unused]] GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
-        = GridBlockExtentSubDivRestrictions::Unrestricted)
-        -> WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>
+        typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
+        typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
+    struct KernelCfg
     {
+        //! The full extent of elements in the grid.
+        TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! The number of elements computed per thread.
+        TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! If this is true, the grid thread extent will be multiples of
+        //! the corresponding block thread extent.
+        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
+        //!     thread extent will be one in this dimension.
+        bool blockThreadMustDivideGridThreadExtent = true;
+        //! The grid block extent subdivision restrictions.
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+            = GridBlockExtentSubDivRestrictions::Unrestricted;
+
         static_assert(
             Dim<TGridElemExtent>::value == Dim<TAcc>::value,
-            "The dimension of TAcc and the dimension of TGridElemExtent have to be identical!");
+            "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
         static_assert(
-            Dim<TThreadElemExtent>::value == Dim<TAcc>::value,
-            "The dimension of TAcc and the dimension of TThreadElemExtent have to be identical!");
+            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
+            "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
         static_assert(
             std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
-            "The idx type of TAcc and the idx type of TGridElemExtent have to be identical!");
+            "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
         static_assert(
             std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
-            "The idx type of TAcc and the idx type of TThreadElemExtent have to be identical!");
+            "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
+    };
+
+    //! \tparam TDev The type of the device.
+    //! \tparam TGridElemExtent The type of the grid element extent.
+    //! \tparam TThreadElemExtent The type of the thread element extent.
+    //! \param dev The device the work division should be valid for.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return The work division for the accelerator based on the kernel and argument types
+    template<
+        typename TAcc,
+        typename TDev,
+        typename TGridElemExtent,
+        typename TThreadElemExtent,
+        typename TKernelFnObj,
+        typename... TArgs>
+    ALPAKA_FN_HOST auto getValidWorkDiv(
+        KernelCfg<TAcc, TGridElemExtent, TThreadElemExtent> const& kernelCfg,
+        [[maybe_unused]] TDev const& dev,
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
+    {
+        using Acc = TAcc;
+
+        // Get max number of threads per block depending on the kernel function attributes.
+        // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
+        // determines the max number of threads per block. This number could be equal or less than the max number of
+        // threads per block defined by device properties.
+        auto const kernelFunctionAttributes
+            = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
+        auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
 
         if constexpr(Dim<TGridElemExtent>::value == 0)
         {
-            auto const zero = Vec<DimInt<0>, Idx<TAcc>>{};
-            ALPAKA_ASSERT(gridElemExtent == zero);
-            ALPAKA_ASSERT(threadElemExtents == zero);
-            return WorkDivMembers<DimInt<0>, Idx<TAcc>>{zero, zero, zero};
+            auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
+            ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
+            ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
+            return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
         }
         else
             return subDivideGridElems(
-                getExtents(gridElemExtent),
-                getExtents(threadElemExtents),
-                getAccDevProps<TAcc>(dev),
-                blockThreadMustDivideGridThreadExtent,
-                gridBlockExtentSubDivRestrictions);
+                getExtents(kernelCfg.gridElemExtent),
+                getExtents(kernelCfg.threadElemExtent),
+                getAccDevProps<Acc>(dev),
+                static_cast<Idx<Acc>>(threadsPerBlock),
+                kernelCfg.blockThreadMustDivideGridThreadExtent,
+                kernelCfg.gridBlockExtentSubDivRestrictions);
+
         using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
         ALPAKA_UNREACHABLE(WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>{V{}, V{}, V{}});
     }
 
+    //! Checks if the work division is supported
+    //!
+    //! \tparam TWorkDiv The type of the work division.
     //! \tparam TDim The dimensionality of the accelerator device properties.
     //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \tparam TWorkDiv The type of the work division.
-    //! \param accDevProps The maxima for the work division.
     //! \param workDiv The work division to test for validity.
+    //! \param accDevProps The maxima for the work division.
     //! \return If the work division is valid for the given accelerator device properties.
-    template<typename TDim, typename TIdx, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDiv(AccDevProps<TDim, TIdx> const& accDevProps, TWorkDiv const& workDiv) -> bool
+    template<typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
     {
         // Get the extents of grid, blocks and threads of the work division to check.
         auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
         auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
-        auto const threadElemExtent = getWorkDiv<Block, Threads>(workDiv);
+        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
 
         // Check that the maximal counts are satisfied.
         if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
@@ -372,13 +448,107 @@ namespace alpaka
         return true;
     }
 
+    //! Checks if the work division is supported
+    //!
+    //! \tparam TWorkDiv The type of the work division.
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \param workDiv The work division to test for validity.
+    //! \param accDevProps The maxima for the work division.
+    //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
+    //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of
+    //! threads per block supported by the device.
+    //! \return Returns true if the work division is valid for the given accelerator device properties and for the
+    //! given kernel. Otherwise returns false.
+    template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
+        AccDevProps<TDim, TIdx> const& accDevProps,
+        KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
+    {
+        // Get the extents of grid, blocks and threads of the work division to check.
+        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
+        auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
+        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
+        // Use kernel properties to find the max threads per block for the kernel
+        auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
+        // Select the minimum to find the upper bound for the threads per block
+        auto const allowedThreadsPerBlock = std::min(
+            static_cast<TIdx>(threadsPerBlockForKernel),
+            static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
+        // Check that the maximal counts are satisfied.
+        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
+        {
+            return false;
+        }
+        if(allowedThreadsPerBlock < blockThreadExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
+        {
+            return false;
+        }
+
+        // Check that the extents for all dimensions are correct.
+        if constexpr(Dim<TWorkDiv>::value > 0)
+        {
+            // Store the maxima allowed for extents of grid, blocks and threads.
+            auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
+            auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
+            auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
+
+            for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
+            {
+                // No extent is allowed to be zero or greater then the allowed maximum.
+                if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
+                   || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
+                   || (threadElemExtentMax[i] < threadElemExtent[i]))
+                {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    //! Checks if the work division is supported for the kernel on the device
+    //!
     //! \tparam TAcc The accelerator to test the validity on.
+    //! \tparam TDev The type of the device.
+    //! \tparam TWorkDiv The type of work division to test for validity.
+    //! \param workDiv The work division to test for validity.
     //! \param dev The device to test the work division for validity on.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return Returns the value of isValidWorkDiv function.
+    template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
+        TDev const& dev,
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> bool
+    {
+        return isValidWorkDiv<TAcc>(
+            workDiv,
+            getAccDevProps<TAcc>(dev),
+            getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
+    }
+
+    //! Checks if the work division is supported by the device
+    //!
+    //! \tparam TAcc The accelerator to test the validity on.
     //! \param workDiv The work division to test for validity.
+    //! \param dev The device to test the work division for validity on.
     //! \return If the work division is valid on this accelerator.
-    template<typename TAcc, typename TDev, typename TWorkDiv>
-    ALPAKA_FN_HOST auto isValidWorkDiv(TDev const& dev, TWorkDiv const& workDiv) -> bool
+    template<typename TAcc, typename TWorkDiv, typename TDev>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
     {
-        return isValidWorkDiv(getAccDevProps<TAcc>(dev), workDiv);
+        return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
     }
 } // namespace alpaka
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
diff --git a/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp b/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
index f81a1567..3d364508 100644
--- a/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
+++ b/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
@@ -21,6 +21,7 @@ namespace alpaka
     public:
         ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
 
+        //! Accepts different alpaka vector types and takes the last TDim number of items.
         ALPAKA_NO_HOST_ACC_WARNING
         template<typename TGridBlockExtent, typename TBlockThreadExtent, typename TThreadElemExtent>
         ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
@@ -33,6 +34,18 @@ namespace alpaka
         {
         }
 
+        //! \brief Accepts single specific type and is called without explicit template parameters.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC WorkDivMembers(
+            alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
+            alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
+            alpaka::Vec<TDim, TIdx> const& elemExtent)
+            : m_gridBlockExtent(gridBlockExtent)
+            , m_blockThreadExtent(blockThreadExtent)
+            , m_threadElemExtent(elemExtent)
+        {
+        }
+
         ALPAKA_NO_HOST_ACC_WARNING
         ALPAKA_FN_HOST_ACC WorkDivMembers(WorkDivMembers const& other)
             : m_gridBlockExtent(other.m_gridBlockExtent)
@@ -83,6 +96,14 @@ namespace alpaka
         Vec<TDim, TIdx> m_threadElemExtent;
     };
 
+    //! Deduction guide for the constructor which can be called without explicit template type parameters
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST_ACC WorkDivMembers(
+        alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
+        alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
+        alpaka::Vec<TDim, TIdx> const& elemExtent) -> WorkDivMembers<TDim, TIdx>;
+
     namespace trait
     {
         //! The WorkDivMembers dimension get trait specialization.
diff --git a/alpaka/script/after_failure.sh b/alpaka/script/after_failure.sh
index f14564cb..96247ae6 100755
--- a/alpaka/script/after_failure.sh
+++ b/alpaka/script/after_failure.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: after_failure>"
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
 then
diff --git a/alpaka/script/before_install.sh b/alpaka/script/before_install.sh
index 0bb1582d..b3c8a807 100755
--- a/alpaka/script/before_install.sh
+++ b/alpaka/script/before_install.sh
@@ -4,8 +4,10 @@
 # Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan, Simeon Ehrig
 # SPDX-License-Identifier: MPL-2.0
 #
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: before_install>"
 
 # because of the strict abort conditions, a variable needs to be defined, if we read from
 # this statement avoids additional checks later in the scripts
@@ -93,10 +95,15 @@ if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
 then
     if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
     then
-        if [[ "${CXX}" == "g++"* ]]
+        if [[ "${ALPAKA_CI_CXX}" == "g++"* ]]
         then
             echo "using libc++ with g++ not yet supported."
             exit 1
         fi
     fi
 fi
+
+if [ "$ALPAKA_CI_OS_NAME" = "Windows" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    export CMAKE_CXX_COMPILER=$ALPAKA_CI_CXX
+fi
diff --git a/alpaka/script/ci.sh b/alpaka/script/ci.sh
index 8bdf3ece..f50b37ec 100755
--- a/alpaka/script/ci.sh
+++ b/alpaka/script/ci.sh
@@ -5,16 +5,18 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: ci>"
 
 ./script/print_env.sh
 source ./script/before_install.sh
-
-if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
-then
-  ./script/docker_ci.sh
-elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
-then
-  source ./script/install.sh
-  ./script/run.sh
+if [ -n "$GITHUB_ACTIONS" ] && [ "$ALPAKA_CI_OS_NAME" = "Linux" ]; then
+  # Workaround for the error: ThreadSanitizer: unexpected memory mapping
+  # change the configuration of the address space layout randomization
+  sudo sysctl vm.mmap_rnd_bits=28
 fi
+
+source ./script/install.sh
+./script/run.sh
diff --git a/alpaka/script/docker_ci.sh b/alpaka/script/docker_ci.sh
deleted file mode 100755
index 6e7c9e25..00000000
--- a/alpaka/script/docker_ci.sh
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2023 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan
-# SPDX-License-Identifier: MPL-2.0
-#
-
-source ./script/set.sh
-source ./script/docker_retry.sh
-
-ALPAKA_CI_BOOST_BRANCH="boost-${ALPAKA_BOOST_VERSION}"
-
-# runtime and compile time options
-ALPAKA_DOCKER_ENV_LIST=()
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_OS_NAME=${ALPAKA_CI_OS_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_TBB_VERSION=${ALPAKA_CI_TBB_VERSION}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_BOOST_VERSION=${ALPAKA_BOOST_VERSION}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
-if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_BUILD_JOBS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BUILD_JOBS=${ALPAKA_CI_BUILD_JOBS}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_RUN_TESTS=${ALPAKA_CI_RUN_TESTS}")
-if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
-fi
-if [ ! -z "${CMAKE_C_COMPILER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}")
-fi
-if [ ! -z "${CMAKE_CXX_COMPILER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
-fi
-if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}")
-fi
-if [ ! -z "${CMAKE_CXX_EXTENSIONS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
-fi
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_GPU_CUDA_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_GPU_CUDA_ENABLE=${alpaka_ACC_GPU_CUDA_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_GPU_HIP_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_GPU_HIP_ENABLE=${alpaka_ACC_GPU_HIP_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE=${alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${alpaka_ACC_SYCL_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_SYCL_ENABLE=${alpaka_ACC_SYCL_ENABLE}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_VERSION=${ALPAKA_CI_CUDA_VERSION}")
-    if [ ! -z "${CMAKE_CUDA_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}")
-    fi
-    if [ ! -z "${CMAKE_CUDA_ARCHITECTURES+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
-    fi
-    if [ ! -z "${CMAKE_CUDA_FLAGS+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
-    if [ ! -z "${CMAKE_HIP_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER})")
-    fi
-    if [ ! -z "${CMAKE_HIP_ARCHITECTURES+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES})")
-    fi
-    if [! -z "${CMAKE_HIP_FLAGS+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_OMP=${ALPAKA_CI_INSTALL_OMP}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_ATOMIC=${ALPAKA_CI_INSTALL_ATOMIC}")
-
-# runtime only options
-ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CI=${alpaka_CI}")
-if [ ! -z "${alpaka_DEBUG+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_DEBUG=${alpaka_DEBUG}")
-fi
-if [ ! -z "${alpaka_CXX_STANDARD+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CXX_STANDARD=${alpaka_CXX_STANDARD}")
-fi
-if [ ! -z "${OMP_NUM_THREADS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "OMP_NUM_THREADS=${OMP_NUM_THREADS}")
-fi
-if [ ! -z "${alpaka_ACC_GPU_CUDA_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_GPU_CUDA_ONLY_MODE=${alpaka_ACC_GPU_CUDA_ONLY_MODE}")
-fi
-if [ ! -z "${alpaka_ACC_GPU_HIP_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ACC_GPU_HIP_ONLY_MODE=${alpaka_ACC_GPU_HIP_ONLY_MODE}")
-fi
-if [ ! -z "${alpaka_CUDA_FAST_MATH+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CUDA_FAST_MATH=${alpaka_CUDA_FAST_MATH}")
-fi
-if [ ! -z "${alpaka_CUDA_FTZ+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CUDA_FTZ=${alpaka_CUDA_FTZ}")
-fi
-if [ ! -z "${alpaka_CUDA_SHOW_REGISTER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CUDA_SHOW_REGISTER=${alpaka_CUDA_SHOW_REGISTER}")
-fi
-if [ ! -z "${alpaka_CUDA_KEEP_FILES+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CUDA_KEEP_FILES=${alpaka_CUDA_KEEP_FILES}")
-fi
-if [ ! -z "${alpaka_CUDA_EXPT_EXTENDED_LAMBDA+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_CUDA_EXPT_EXTENDED_LAMBDA=${alpaka_CUDA_EXPT_EXTENDED_LAMBDA}")
-fi
-if [ ! -z "${CMAKE_CUDA_SEPARABLE_COMPILATION+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CUDA_SEPARABLE_COMPILATION=${CMAKE_CUDA_SEPARABLE_COMPILATION}")
-fi
-if [ ! -z "${CMAKE_INSTALL_PREFIX+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}")
-fi
-if [ ! -z "${alpaka_USE_MDSPAN+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_USE_MDSPAN=${alpaka_USE_MDSPAN}")
-fi
-if [ ! -z "${alpaka_ENABLE_WERROR+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_ENABLE_WERROR=${alpaka_ENABLE_WERROR}")
-fi
-if [ ! -z "${alpaka_SYCL_ONEAPI_CPU+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_SYCL_ONEAPI_CPU=${alpaka_SYCL_ONEAPI_CPU}")
-fi
-if [! -z "${alpaka_SYCL_ONEAPI_CPU_ISA+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "alpaka_SYCL_ONEAPI_CPU_ISA=${alpaka_SYCL_ONEAPI_CPU_ISA}")
-fi
-
-docker_retry docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash -c "source ./script/install.sh && ./script/run.sh"
diff --git a/alpaka/script/docker_retry.sh b/alpaka/script/docker_retry.sh
deleted file mode 100755
index 37d7c7ff..00000000
--- a/alpaka/script/docker_retry.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2019-2020 Benjamin Worpitz, Rene Widera
-# SPDX-License-Identifier: MPL-2.0
-#
-
-ANSI_RED="\033[31m"
-ANSI_RESET="\033[0m"
-
-# rerun docker command if error 125 (
-#   - triggered by image download problems
-#   - wait 30 seconds before retry
-docker_retry() {
-  set +euo pipefail
-  local result=0
-  local count=1
-  while [ $count -le 3 ]; do
-    [ $result -eq 125 ] && {
-      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of 3.${ANSI_RESET}\n" >&2
-    }
-    "$@"
-    result=$?
-    [ $result -ne 125 ] && break
-    count=$((count + 1))
-    sleep 30
-  done
-  [ $count -gt 3 ] && {
-    echo -e "\n${ANSI_RED}The command \"$*\" failed 3 times.${ANSI_RESET}\n" >&2
-  }
-  return $result
-}
diff --git a/alpaka/script/gitlab_ci_run.sh b/alpaka/script/gitlab_ci_run.sh
index 5a1df140..b82be79f 100755
--- a/alpaka/script/gitlab_ci_run.sh
+++ b/alpaka/script/gitlab_ci_run.sh
@@ -5,15 +5,8 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
-
-# inside the agc-container, the user is root and does not require sudo
-# to compatibility to other container, fake the missing sudo command
-if ! command -v sudo &> /dev/null
-then
-    cp ${CI_PROJECT_DIR}/script/gitlabci/fake_sudo.sh /usr/bin/sudo
-    chmod +x /usr/bin/sudo
-fi
+set +xv
+source ./script/setup_utilities/set.sh
 
 source ./script/before_install.sh
 source ./script/install.sh
diff --git a/alpaka/script/gitlabci/fake_sudo.sh b/alpaka/script/gitlabci/fake_sudo.sh
deleted file mode 100644
index 59498ace..00000000
--- a/alpaka/script/gitlabci/fake_sudo.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2021 Simeon Ehrig
-# SPDX-License-Identifier: MPL-2.0
-#
-
-
-# execute arguments
-$@
diff --git a/alpaka/script/gitlabci/job_analysis.yml b/alpaka/script/gitlabci/job_analysis.yml
index 3cfee9ff..7cd1695d 100644
--- a/alpaka/script/gitlabci/job_analysis.yml
+++ b/alpaka/script/gitlabci/job_analysis.yml
@@ -8,7 +8,7 @@ linux_clang-14_debug_analysis:
     CMAKE_BUILD_TYPE: Debug
     ALPAKA_BOOST_VERSION: 1.80.0
     ALPAKA_CI_CMAKE_VER: 3.23.5
+    alpaka_CXX_STANDARD: 17
     ALPAKA_CI_ANALYSIS: "ON"
     alpaka_DEBUG: 2
     alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: "OFF"
-
diff --git a/alpaka/script/gitlabci/job_base.yml b/alpaka/script/gitlabci/job_base.yml
index 96b77fa9..a91bb2a6 100644
--- a/alpaka/script/gitlabci/job_base.yml
+++ b/alpaka/script/gitlabci/job_base.yml
@@ -18,13 +18,12 @@
   image: ubuntu:${ALPAKA_CI_UBUNTU_VER}
   variables:
     ALPAKA_CI_UBUNTU_VER: "20.04"
-    CC: gcc
-    CXX: g++
+    ALPAKA_CI_CXX: g++
     alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: "ON"
     alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: "ON"
     alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: "OFF"
     alpaka_ACC_GPU_CUDA_ENABLE: "ON"
-    CMAKE_CUDA_COMPILER: nvcc
+    ALPAKA_CI_CUDA_COMPILER: nvcc
     ALPAKA_CI_STDLIB: libstdc++
     # CI contains a Quadro P5000 (sm_61)
     CMAKE_CUDA_ARCHITECTURES: "61"
@@ -51,13 +50,12 @@
   image: ubuntu:${ALPAKA_CI_UBUNTU_VER}
   variables:
     ALPAKA_CI_UBUNTU_VER: "20.04"
-    CC: clang
-    CXX: clang++
+    ALPAKA_CI_CXX: clang++
     alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: "ON"
     alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: "ON"
     alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: "OFF"
     alpaka_ACC_GPU_CUDA_ENABLE: "ON"
-    CMAKE_CUDA_COMPILER: clang++
+    ALPAKA_CI_CUDA_COMPILER: clang++
     ALPAKA_CI_STDLIB: libstdc++
     # CI contains a Quadro P5000 (sm_61)
     CMAKE_CUDA_ARCHITECTURES: "61"
@@ -87,8 +85,7 @@
   image: ubuntu:${ALPAKA_CI_UBUNTU_VER}
   variables:
     ALPAKA_CI_UBUNTU_VER: "22.04"
-    CC: clang
-    CXX: clang++
+    ALPAKA_CI_CXX: clang++
     ALPAKA_CI_SANITIZERS: ""
     ALPAKA_CI_ANALYSIS: "OFF"
     ALPAKA_CI_TBB_VERSION: 2021.4.0
diff --git a/alpaka/script/gitlabci/job_clang.yml b/alpaka/script/gitlabci/job_clang.yml
index 2dcc4250..9aed3dd1 100644
--- a/alpaka/script/gitlabci/job_clang.yml
+++ b/alpaka/script/gitlabci/job_clang.yml
@@ -9,6 +9,7 @@ linux_clang-11_release:
     ALPAKA_BOOST_VERSION: 1.75.0
     ALPAKA_CI_CMAKE_VER: 3.22.6
     CMAKE_CXX_EXTENSIONS: "OFF"
+    alpaka_CXX_STANDARD: 17
 
 linux_clang-14_debug:
   extends: .base_clang
@@ -18,7 +19,8 @@ linux_clang-14_debug:
     CMAKE_BUILD_TYPE: Debug
     ALPAKA_BOOST_VERSION: 1.80.0
     ALPAKA_CI_CMAKE_VER: 3.23.5
- 
+    alpaka_CXX_STANDARD: 17
+
 linux_clang-16_relwithdebinfo_asan_c++20:
   extends: .base_clang
   variables:
diff --git a/alpaka/script/gitlabci/job_cuda.yml b/alpaka/script/gitlabci/job_cuda.yml
index 55e85c05..3da01c53 100644
--- a/alpaka/script/gitlabci/job_cuda.yml
+++ b/alpaka/script/gitlabci/job_cuda.yml
@@ -1,27 +1,29 @@
 # SPDX-License-Identifier: MPL-2.0
 
 # nvcc + g++
-linux_nvcc12.0_gcc12_debug_relocatable_device_code_compile_only:
+linux_nvcc12.0_gcc11_debug_relocatable_device_code_compile_only:
   extends: .base_cuda_gcc_compile_only
-  image: registry.hzdr.de/crp/alpaka-group-container/alpaka-ci-ubuntu20.04-cuda110-gcc:3.1
+  image: registry.hzdr.de/crp/alpaka-group-container/alpaka-ci-ubuntu20.04-cuda110-gcc:3.2
   variables:
     ALPAKA_CI_UBUNTU_VER: "20.04"
     ALPAKA_CI_CUDA_VERSION: "12.0"
-    ALPAKA_CI_GCC_VER: 12
+    ALPAKA_CI_GCC_VER: 11
     CMAKE_BUILD_TYPE: Debug
     ALPAKA_BOOST_VERSION: 1.81.0
     ALPAKA_CI_CMAKE_VER: 3.26.5
+    alpaka_CXX_STANDARD: 17
     alpaka_RELOCATABLE_DEVICE_CODE: "ON"
 
-linux_nvcc12.0_gcc12_release_extended_lambda_off_compile_only:
+linux_nvcc12.0_gcc11_release_extended_lambda_off_compile_only:
   extends: .base_cuda_gcc_compile_only
-  image: registry.hzdr.de/crp/alpaka-group-container/alpaka-ci-ubuntu20.04-cuda110-gcc:3.1
+  image: registry.hzdr.de/crp/alpaka-group-container/alpaka-ci-ubuntu20.04-cuda110-gcc:3.2
   variables:
     ALPAKA_CI_UBUNTU_VER: "20.04"
     ALPAKA_CI_CUDA_VERSION: "12.0"
-    ALPAKA_CI_GCC_VER: 12
+    ALPAKA_CI_GCC_VER: 11
     CMAKE_BUILD_TYPE: Release
     ALPAKA_BOOST_VERSION: 1.82.0
     ALPAKA_CI_CMAKE_VER: 3.27.1
+    alpaka_CXX_STANDARD: 17
     alpaka_ACC_GPU_CUDA_ENABLE: "ON"
     alpaka_CUDA_EXPT_EXTENDED_LAMBDA: "OFF"
diff --git a/alpaka/script/gitlabci/print_env.sh b/alpaka/script/gitlabci/print_env.sh
index 95bc0cbf..5a4dae15 100755
--- a/alpaka/script/gitlabci/print_env.sh
+++ b/alpaka/script/gitlabci/print_env.sh
@@ -5,6 +5,12 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
+# set exit on error manually instead using setup_utilities because
+# otherwise the begin of the job log looks not helpful
+if [ -z ${alpaka_DISABLE_EXIT_FAILURE+x} ]; then
+    set -e
+fi
+
 # display output with yellow color
 echo -e "\033[0;33mSteps to setup containter locally"
 
@@ -14,17 +20,17 @@ if [ "${CMAKE_CXX_COMPILER}" == "nvc++" ] || [ "${alpaka_ACC_GPU_CUDA_ENABLE}" =
 then
     if [ "${ALPAKA_CI_RUN_TESTS}" == "ON" ];
     then
-	    echo "${first_step_prefix} docker run --gpus=all -it ${CI_JOB_IMAGE} bash"
+        echo "${first_step_prefix} docker run --gpus=all -it ${CI_JOB_IMAGE} bash"
     else
-	    echo "${first_step_prefix} docker run -it ${CI_JOB_IMAGE} bash"
+        echo "${first_step_prefix} docker run -it ${CI_JOB_IMAGE} bash"
     fi
 elif [ "${alpaka_ACC_GPU_HIP_ENABLE}" == "ON" ];
 then
     if [ "${ALPAKA_CI_RUN_TESTS}" == "ON" ];
     then
-	    echo "${first_step_prefix} docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video ${CI_JOB_IMAGE} bash"
+        echo "${first_step_prefix} docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video ${CI_JOB_IMAGE} bash"
     else
-	    echo "${first_step_prefix} docker run -it ${CI_JOB_IMAGE} bash"
+        echo "${first_step_prefix} docker run -it ${CI_JOB_IMAGE} bash"
     fi
 else
     echo "${first_step_prefix} docker run -it ${CI_JOB_IMAGE} bash"
@@ -33,10 +39,12 @@ fi
 echo -e "2. Run the following export commands in the container to setup enviroment\n"
 
 # take all env variables, filter it and display it with a `export` prefix
-printenv | grep -E 'alpaka_*|ALPAKA_*|CMAKE_*|BOOST_|CC|CXX|CUDA_' | while read -r line ; do
+printenv | grep -E 'alpaka_*|ALPAKA_*|CMAKE_*|BOOST_|CUDA_' | while read -r line ; do
     echo "export $line \\"
 done
 
+# the variable is not set, but should be set if a job is debugged locally in a container
+echo 'export alpaka_DISABLE_EXIT_FAILURE=true \'
 echo 'export GITLAB_CI=true'
 echo ""
 
diff --git a/alpaka/script/homebrew/13.2.1/libomp.rb b/alpaka/script/homebrew/13.2.1/libomp.rb
index ec77d91b..8225ad21 100644
--- a/alpaka/script/homebrew/13.2.1/libomp.rb
+++ b/alpaka/script/homebrew/13.2.1/libomp.rb
@@ -6,12 +6,12 @@ class Libomp < Formula
     url "https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.1/openmp-12.0.1.src.tar.xz"
     sha256 "60fe79440eaa9ebf583a6ea7f81501310388c02754dbe7dc210776014d06b091"
     license "MIT"
-  
+
     livecheck do
       url "https://llvm.org/"
       regex(/LLVM (\d+\.\d+\.\d+)/i)
     end
-  
+
     bottle do
       sha256 cellar: :any,                 arm64_big_sur: "9b4d71ac4e8a8b8d04819b1bfd155bcb266a9fdf1405b24c9e3801858b08d8bf"
       sha256 cellar: :any,                 big_sur:       "cba5086bd24f1aaa196900f784d7cf1c3dc0de1f536db2f6dccf571a7850d5d9"
@@ -19,13 +19,13 @@ class Libomp < Formula
       sha256 cellar: :any,                 mojave:        "bb25a639e722fe6ab1ede965a5a8854696f40daac2c9c69ad36a8be7f8ae2606"
       sha256 cellar: :any_skip_relocation, x86_64_linux:  "732e9e28300c5e0b3fe8de12e5b6617bc8bb39cc401d5a35cffbb305097a70e9"
     end
-  
+
     depends_on "cmake" => :build
-  
+
     on_linux do
       keg_only "provided by LLVM, which is not keg-only on Linux"
     end
-  
+
     def install
       # Build universal binary
       ENV.permit_arch_flags
@@ -39,7 +39,7 @@ def install
                            "-DLIBOMP_INSTALL_ALIASES=OFF", "-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64"
       system "make", "install"
     end
-  
+
     test do
       (testpath/"test.cpp").write <<~EOS
         #include <omp.h>
@@ -62,4 +62,3 @@ def install
       system "./test"
     end
   end
-  
\ No newline at end of file
diff --git a/alpaka/script/homebrew/14.2/libomp.rb b/alpaka/script/homebrew/14.2/libomp.rb
index 2e08f822..d4b69bad 100644
--- a/alpaka/script/homebrew/14.2/libomp.rb
+++ b/alpaka/script/homebrew/14.2/libomp.rb
@@ -6,12 +6,12 @@ class Libomp < Formula
     url "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.6/openmp-14.0.6.src.tar.xz"
     sha256 "4f731ff202add030d9d68d4c6daabd91d3aeed9812e6a5b4968815cfdff0eb1f"
     license "MIT"
-  
+
     livecheck do
       url "https://llvm.org/"
       regex(/LLVM (\d+\.\d+\.\d+)/i)
     end
-  
+
     bottle do
       sha256 cellar: :any,                 arm64_monterey: "b36b1393289e7d98fc03425b6c23a63c4f5e9290ecf0922d45e6fde2973ba8fb"
       sha256 cellar: :any,                 arm64_big_sur:  "f00a5f352167b2fd68ad25b1959ef66a346023c6dbeb50892b386381d7ebe183"
@@ -20,20 +20,20 @@ class Libomp < Formula
       sha256 cellar: :any,                 catalina:       "63cdbb3a70c4b85a6a92a55c8ab2384ded244d37568cd769409dee00a14b581d"
       sha256 cellar: :any_skip_relocation, x86_64_linux:   "470c1338f8c1bc8ef1a41e86bb9beddcff9c353947a2073b2c2b4f584db9bd20"
     end
-  
+
     depends_on "cmake" => :build
     uses_from_macos "llvm" => :build
-  
+
     on_linux do
       keg_only "provided by LLVM, which is not keg-only on Linux"
     end
-  
+
     def install
       # Disable LIBOMP_INSTALL_ALIASES, otherwise the library is installed as
       # libgomp alias which can conflict with GCC's libgomp.
       args = ["-DLIBOMP_INSTALL_ALIASES=OFF"]
       args << "-DOPENMP_ENABLE_LIBOMPTARGET=OFF" if OS.linux?
-  
+
       # Build universal binary
       ENV.permit_arch_flags
       ENV.runtime_cpu_detection
@@ -42,14 +42,14 @@ def install
       system "cmake", "-S", "openmp-#{version}.src", "-B", "build/shared", *std_cmake_args, *args
       system "cmake", "--build", "build/shared"
       system "cmake", "--install", "build/shared"
-  
+
       system "cmake", "-S", "openmp-#{version}.src", "-B", "build/static",
                       "-DLIBOMP_ENABLE_SHARED=OFF",
                       *std_cmake_args, *args
       system "cmake", "--build", "build/static"
       system "cmake", "--install", "build/static"
     end
-  
+
     test do
       (testpath/"test.cpp").write <<~EOS
         #include <omp.h>
@@ -72,4 +72,3 @@ def install
       system "./test"
     end
   end
-  
\ No newline at end of file
diff --git a/alpaka/script/install.sh b/alpaka/script/install.sh
index b2b5184f..c0ebab1b 100755
--- a/alpaka/script/install.sh
+++ b/alpaka/script/install.sh
@@ -5,49 +5,19 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install>"
 
 : ${ALPAKA_CI_ANALYSIS?"ALPAKA_CI_ANALYSIS must be specified"}
 : ${ALPAKA_CI_INSTALL_CUDA?"ALPAKA_CI_INSTALL_CUDA must be specified"}
 : ${ALPAKA_CI_INSTALL_HIP?"ALPAKA_CI_INSTALL_HIP must be specified"}
 : ${ALPAKA_CI_INSTALL_TBB?"ALPAKA_CI_INSTALL_TBB must be specified"}
 
-# the agc-manager only exists in the agc-container
-# set alias to false, so each time if we ask the agc-manager if a software is installed, it will
-# return false and the installation of software will be triggered
-if [ "$ALPAKA_CI_OS_NAME" != "Linux" ] || [ ! -f "/usr/bin/agc-manager" ]
-then
-    echo "agc-manager is not installed"
-
-    echo '#!/bin/bash' > agc-manager
-    echo 'exit 1' >> agc-manager
-
-    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
-    then
-        sudo chmod +x agc-manager
-        sudo mv agc-manager /usr/bin/agc-manager
-    elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
-    then
-        chmod +x agc-manager
-        mv agc-manager /usr/bin
-    elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
-    then
-        sudo chmod +x agc-manager
-        sudo mv agc-manager /usr/local/bin
-    else
-        echo "unknown operation system: ${ALPAKA_CI_OS_NAME}"
-        exit 1
-    fi
-else
-    echo "found agc-manager"
-fi
-
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
 then
     travis_retry apt-get -y --quiet update
-    travis_retry apt-get -y install sudo
 
     # tzdata is installed by software-properties-common but it requires some special handling
     if [[ "$(cat /etc/os-release)" == *"20.04"* ]]
@@ -68,15 +38,13 @@ fi
 
 if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then source ./script/install_analysis.sh ;fi
 
-# Install CUDA before installing gcc as it installs gcc-4.8 and overwrites our selected compiler
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then source ./script/install_cuda.sh ;fi
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
 then
-    if [[ "${CXX}" == "g++"* ]] ;then source ./script/install_gcc.sh ;fi
+    if [[ "${ALPAKA_CI_CXX}" == "g++"* ]] ;then source ./script/install_gcc.sh ;fi
     # do not install clang if we use HIP, HIP/ROCm is shipping an own clang version
-    if [[ "${CXX}" == "clang++" ]] && [ "${ALPAKA_CI_INSTALL_HIP}" != "ON" ] ;then source ./script/install_clang.sh ;fi
-    if [[ "${CXX}" == "icpx" ]] ;then source ./script/install_oneapi.sh ;fi
+    if [[ "${ALPAKA_CI_CXX}" == "clang++" ]] && [ "${ALPAKA_CI_INSTALL_HIP}" != "ON" ] ;then source ./script/install_clang.sh ;fi
+    if [[ "${ALPAKA_CI_CXX}" == "icpx" ]] ;then source ./script/install_oneapi.sh ;fi
 elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
 then
     echo "### list all applications ###"
@@ -85,8 +53,10 @@ then
     sudo xcode-select -s "/Applications/Xcode_${ALPAKA_CI_XCODE_VER}.app/Contents/Developer"
 fi
 
+if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then source ./script/install_cuda.sh ;fi
+
 # Don't install TBB for oneAPI runners - it will be installed as part of oneAPI
-if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ] && [ "${CXX}" != "icpx" ]  
+if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ] && [ "${ALPAKA_CI_CXX}" != "icpx" ]
 then
     source ./script/install_tbb.sh
 fi
@@ -103,4 +73,3 @@ then
 fi
 
 source ./script/install_boost.sh
-
diff --git a/alpaka/script/install_analysis.sh b/alpaka/script/install_analysis.sh
index 2b06c125..d86cfb15 100755
--- a/alpaka/script/install_analysis.sh
+++ b/alpaka/script/install_analysis.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_analysis>"
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
 then
diff --git a/alpaka/script/install_boost.sh b/alpaka/script/install_boost.sh
index 233c0f97..4e6052b9 100755
--- a/alpaka/script/install_boost.sh
+++ b/alpaka/script/install_boost.sh
@@ -5,8 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: install boost>"
 
 : "${BOOST_ROOT?'BOOST_ROOT must be specified'}"
 : "${ALPAKA_BOOST_VERSION?'ALPAKA_BOOST_VERSION must be specified'}"
@@ -16,8 +18,7 @@ then
     : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
 fi
 : "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
-: "${CXX?'CXX must be specified'}"
-: "${CC?'CC must be specified'}"
+: "${ALPAKA_CI_CXX?'ALPAKA_CI_CXX must be specified'}"
 : "${ALPAKA_CI_INSTALL_ATOMIC?'ALPAKA_CI_INSTALL_ATOMIC must be specified'}"
 if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
 then
@@ -29,15 +30,18 @@ then
     ALPAKA_CI_STDLIB=""
 fi
 
-if [ "${CXX}" != "icpc" ] && [ "${ALPAKA_CI_STDLIB}" != "libc++" ]
+if [ "${ALPAKA_CI_CXX}" != "icpc" ] && [ "${ALPAKA_CI_STDLIB}" != "libc++" ]
 then
     if agc-manager -e boost@${ALPAKA_BOOST_VERSION} ; then
+        echo_green "<USE: preinstalled BOOST ${ALPAKA_BOOST_VERSION}>"
         export BOOST_ROOT=$(agc-manager -b boost@${ALPAKA_BOOST_VERSION})
         export ALPAKA_CI_BOOST_LIB_DIR=${BOOST_ROOT}
         return
     fi
 fi
 
+echo_yellow "<INSTALL: BOOST ${ALPAKA_BOOST_VERSION}>"
+
 ALPAKA_CI_BOOST_BRANCH="boost-${ALPAKA_BOOST_VERSION}"
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
@@ -66,15 +70,22 @@ then
         TOOLSET="msvc-14.3"
     fi
     # Add new versions as needed
-elif [ "${CXX}" == "icpc" ]
+elif [ "${ALPAKA_CI_CXX}" == "icpc" ]
 then
     TOOLSET="intel-linux"
-elif [ "${CXX}" == "icpx" ]
+elif [ "${ALPAKA_CI_CXX}" == "g++" ]
+then
+    TOOLSET="gcc"
+elif [ "${ALPAKA_CI_CXX}" == "clang++" ]
+then
+    TOOLSET="clang"
+elif [ "${ALPAKA_CI_CXX}" == "icpx" ]
 then
     # icpx is binary compatibly with g++ and ipcx is not supported by b2
     TOOLSET="gcc"
 else
-    TOOLSET="${CC}"
+    echo_red "unknown ALPAKA_CI_CXX: ${ALPAKA_CI_CXX}"
+    exit 1
 fi
 
 # Bootstrap boost.
@@ -146,7 +157,7 @@ then
 
     # Clang is not supported by the FindBoost script.
     # boost (especially old versions) produces too much warnings when using clang (newer versions) so that the 4 MiB log is too short.
-    if [[ "${CXX}" == "clang++"* ]]
+    if [[ "${ALPAKA_CI_CXX}" == "clang++"* ]]
     then
         ALPAKA_BOOST_B2_CXXFLAGS+=" -Wunused-private-field -Wno-unused-local-typedef -Wno-c99-extensions -Wno-variadic-macros"
     fi
diff --git a/alpaka/script/install_clang.sh b/alpaka/script/install_clang.sh
index 8c8382c9..63231ea8 100755
--- a/alpaka/script/install_clang.sh
+++ b/alpaka/script/install_clang.sh
@@ -5,13 +5,13 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_clang>"
 
 : "${ALPAKA_CI_CLANG_VER?'ALPAKA_CI_CLANG_VER must be specified'}"
 : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-: "${CXX?'CXX must be specified'}"
 
 #TODO(SimeonEhrig): remove this statement, if ppa's are fixed in alpaka-group-container
 if [[ -f "/etc/apt/sources.list.d/llvm.list" ]];
@@ -19,8 +19,11 @@ then
     sudo rm /etc/apt/sources.list.d/llvm.list
 fi
 
-if ! agc-manager -e clang@${ALPAKA_CI_CLANG_VER}
+if agc-manager -e clang@${ALPAKA_CI_CLANG_VER}
 then
+    echo_green "<USE preinstalled Clang ${ALPAKA_CI_CLANG_VER}>"
+else
+    echo_yellow "<INSTALL: Clang ${ALPAKA_CI_CLANG_VER}>"
     # Install from LLVM repository (if available); otherwise install LLVM from official Ubuntu repositories
     ALPAKA_CI_UBUNTU_NAME=`lsb_release -c | awk '{print $2}'`
     wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
@@ -48,11 +51,15 @@ then
         travis_retry sudo apt-get -y --quiet update
         travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++-${ALPAKA_CI_CLANG_VER}-dev
         travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++abi-${ALPAKA_CI_CLANG_VER}-dev
-        if [ "${ALPAKA_CI_CLANG_VER}" -ge 12 ]
-        then
-            # Starting from LLVM 12 libunwind is required when using libc++. For some reason this isn't installed by default
-            travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libunwind-${ALPAKA_CI_CLANG_VER}-dev
-        fi
+    fi
+
+    # Workaround if clang uses the stdlibc++. The stdlibc++-9 does not support C++20, therefore we install the stdlibc++-11. Clang automatically uses the latest stdlibc++ version.
+    if [[ "$(cat /etc/os-release)" =~ "20.04" ]] && [ "${alpaka_CXX_STANDARD}" == "20" ];
+    then
+        travis_retry sudo apt install -y --no-install-recommends software-properties-common
+        sudo apt-add-repository ppa:ubuntu-toolchain-r/test -y
+        travis_retry sudo apt update
+        travis_retry sudo apt install -y --no-install-recommends g++-11
     fi
 
     if [ "${alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE}" = "ON" ] || [ "${alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE}" = "ON" ]
@@ -61,11 +68,14 @@ then
         travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install "${LIBOMP_PACKAGE}"
     fi
 
-    sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-"${ALPAKA_CI_CLANG_VER}" 50
-    sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-"${ALPAKA_CI_CLANG_VER}" 50
-    sudo update-alternatives --install /usr/bin/cc cc /usr/bin/clang-"${ALPAKA_CI_CLANG_VER}" 50
-    sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++-"${ALPAKA_CI_CLANG_VER}" 50
+    which clang++-${ALPAKA_CI_CLANG_VER}
+    export CMAKE_CXX_COMPILER=$(which clang++-${ALPAKA_CI_CLANG_VER})
+
+    # create soft link clang and clang++ to the actual executable
+    # the boost build script requires that the executable clang++ exist
+    ln -s $(which clang-${ALPAKA_CI_CLANG_VER}) $(dirname $(which clang-${ALPAKA_CI_CLANG_VER}))/clang
+    ln -s $(which clang++-${ALPAKA_CI_CLANG_VER}) $(dirname $(which clang++-${ALPAKA_CI_CLANG_VER}))/clang++
 fi
 
-which "${CXX}"
-${CXX} --version
+which "${CMAKE_CXX_COMPILER}"
+${CMAKE_CXX_COMPILER} --version
diff --git a/alpaka/script/install_cmake.sh b/alpaka/script/install_cmake.sh
index 6fa786dc..21d5454b 100755
--- a/alpaka/script/install_cmake.sh
+++ b/alpaka/script/install_cmake.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_cmake>"
 
 : "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
 : "${ALPAKA_CI_CMAKE_VER?'ALPAKA_CI_CMAKE_VER must be specified'}"
@@ -15,8 +16,10 @@ source ./script/set.sh
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
 then
     if agc-manager -e cmake@${ALPAKA_CI_CMAKE_VER} ; then
+        echo_green "<USE: preinstalled CMake ${ALPAKA_CI_CMAKE_VER}>"
         export ALPAKA_CI_CMAKE_DIR=$(agc-manager -b cmake@${ALPAKA_CI_CMAKE_VER})
     else
+        echo_yellow "<INSTALL: CMake ${ALPAKA_CI_CMAKE_VER}>"
         if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
         then
             # Download the selected version.
diff --git a/alpaka/script/install_cuda.sh b/alpaka/script/install_cuda.sh
index 90dc39a8..b9d8e514 100755
--- a/alpaka/script/install_cuda.sh
+++ b/alpaka/script/install_cuda.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_cuda>"
 
 : "${ALPAKA_CI_CUDA_VERSION?'ALPAKA_CI_CUDA_VERSION must be specified'}"
 
@@ -15,17 +16,24 @@ ALPAKA_CUDA_VER_SEMANTIC=( ${ALPAKA_CI_CUDA_VERSION//./ } )
 ALPAKA_CUDA_VER_MAJOR="${ALPAKA_CUDA_VER_SEMANTIC[0]}"
 echo ALPAKA_CUDA_VER_MAJOR: "${ALPAKA_CUDA_VER_MAJOR}"
 
+# if LD_LIBRARY_PATH is not set, the following statement will throw an unbound variable error
+# export LD_LIBRARY_PATH=/path/to/lib:${LD_LIBRARY_PATH} 
+if [ -z ${LD_LIBRARY_PATH+x} ]; then
+    export LD_LIBRARY_PATH=""
+fi
 
 if agc-manager -e cuda@${ALPAKA_CI_CUDA_VERSION}
 then
+    echo_green "<USE: preinstalled CUDA ${ALPAKA_CI_CUDA_VERSION}>"
     ALPAKA_CI_CUDA_PATH=$(agc-manager -b cuda@${ALPAKA_CI_CUDA_VERSION})
     export PATH=${ALPAKA_CI_CUDA_PATH}/bin:${PATH}
     export LD_LIBRARY_PATH=${ALPAKA_CI_CUDA_PATH}/lib64:${LD_LIBRARY_PATH}
 else
+    echo_yellow "<INSTALL: CUDA ${ALPAKA_CI_CUDA_VERSION}>"
     if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
     then
         : "${ALPAKA_CI_CUDA_DIR?'ALPAKA_CI_CUDA_DIR must be specified'}"
-        : "${CMAKE_CUDA_COMPILER?'CMAKE_CUDA_COMPILER must be specified'}"
+        : "${ALPAKA_CI_CUDA_COMPILER?'ALPAKA_CI_CUDA_COMPILER must be specified'}"
 
         if [[ "$(cat /etc/os-release)" == *"20.04"* ]]
         then
@@ -36,19 +44,7 @@ else
         fi
 
         # Set the correct CUDA downloads
-        if [ "${ALPAKA_CI_CUDA_VERSION}" == "11.0" ]
-        then
-            ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-11-0-local
-            ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.0.3-450.51.06-1_amd64.deb
-            ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-            ALPAKA_CUDA_OLD_KEYS=true
-        elif [ "${ALPAKA_CI_CUDA_VERSION}" == "11.1" ]
-        then
-            ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-11-1-local
-            ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.1.1-455.32.00-1_amd64.deb
-            ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-            ALPAKA_CUDA_OLD_KEYS=true
-        elif [ "${ALPAKA_CI_CUDA_VERSION}" == "11.2" ]
+        if [ "${ALPAKA_CI_CUDA_VERSION}" == "11.2" ]
         then
             ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-11-2-local
             ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.2.2-460.32.03-1_amd64.deb
@@ -114,8 +110,26 @@ else
             ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_12.3.2-545.23.08-1_amd64.deb
             ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
             ALPAKA_CUDA_OLD_KEYS=false
+        elif [ "${ALPAKA_CI_CUDA_VERSION}" == "12.4" ]
+        then
+            ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-12-4-local
+            ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_12.4.0-550.54.14-1_amd64.deb
+            ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+            ALPAKA_CUDA_OLD_KEYS=false
+        elif [ "${ALPAKA_CI_CUDA_VERSION}" == "12.5" ]
+        then
+            ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-12-5-local
+            ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_12.5.0-555.42.02-1_amd64.deb
+            ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+            ALPAKA_CUDA_OLD_KEYS=false
+        elif [ "${ALPAKA_CI_CUDA_VERSION}" == "12.6" ]
+        then
+            ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-"${ALPAKA_CUDA_DISTRO}"-12-6-local
+            ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_12.6.1-560.35.03-1_amd64.deb
+            ALPAKA_CUDA_PKG_FILE_PATH=https://developer.download.nvidia.com/compute/cuda/12.6.1/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+            ALPAKA_CUDA_OLD_KEYS=false
         else
-            echo CUDA versions other than 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2 and 12.3 are not currently supported on linux!
+            echo CUDA versions other than 11.2-12.6 are not currently supported on linux!
         fi
 
         # First install the local repository.
@@ -149,7 +163,7 @@ else
         export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
         export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
-        if [ "${CMAKE_CUDA_COMPILER}" == "clang++" ]
+        if [ "${ALPAKA_CI_CUDA_COMPILER}" == "clang++" ]
         then
             travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-multilib
         fi
@@ -172,3 +186,14 @@ else
         ./cuda_installer.exe -s "nvcc_${ALPAKA_CI_CUDA_VERSION}" "curand_dev_${ALPAKA_CI_CUDA_VERSION}" "cudart_${ALPAKA_CI_CUDA_VERSION}" "thrust_${ALPAKA_CI_CUDA_VERSION}" "visual_studio_integration_${ALPAKA_CI_CUDA_VERSION}"
     fi
 fi
+
+if [ "${ALPAKA_CI_CUDA_COMPILER}" == "nvcc" ]
+then
+    export CMAKE_CUDA_COMPILER=$(which nvcc)
+elif [ "${ALPAKA_CI_CUDA_COMPILER}" == "clang++" ]
+then
+    export CMAKE_CUDA_COMPILER=$(which clang++-${ALPAKA_CI_CLANG_VER})
+else
+    echo_red "unknown ALPAKA_CI_CUDA_COMPILER: ${ALPAKA_CI_CUDA_COMPILER}"
+    exit 1
+fi
diff --git a/alpaka/script/install_doxygen.sh b/alpaka/script/install_doxygen.sh
index f63bbd80..f99f7843 100755
--- a/alpaka/script/install_doxygen.sh
+++ b/alpaka/script/install_doxygen.sh
@@ -5,8 +5,9 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_doxygen>"
 
 travis_retry sudo apt-get -y --quiet install --no-install-recommends doxygen graphviz
diff --git a/alpaka/script/install_gcc.sh b/alpaka/script/install_gcc.sh
index 3daaf3a0..72dfb960 100755
--- a/alpaka/script/install_gcc.sh
+++ b/alpaka/script/install_gcc.sh
@@ -5,28 +5,40 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_gcc>"
 
 : "${ALPAKA_CI_GCC_VER?'ALPAKA_CI_GCC_VER must be specified'}"
 : "${ALPAKA_CI_SANITIZERS?'ALPAKA_CI_SANITIZERS must be specified'}"
-: "${CXX?'CXX must be specified'}"
 
-if ! agc-manager -e gcc@${ALPAKA_CI_GCC_VER}
+if agc-manager -e gcc@${ALPAKA_CI_GCC_VER}
 then
+    echo_green "<USE: preinstalled GCC ${ALPAKA_CI_GCC_VER}>"
+else
+    echo_yellow "<INSTALL: GCC ${ALPAKA_CI_GCC_VER}>"
+
     travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/ppa # Contains gcc 10.4 (Ubuntu 20.04)
     travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test # Contains gcc 11 (Ubuntu 20.04)
     travis_retry sudo apt-get -y --quiet update
     travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-"${ALPAKA_CI_GCC_VER}"
 fi
 
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"${ALPAKA_CI_GCC_VER}" 50
-sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"${ALPAKA_CI_GCC_VER}" 50
+which g++-${ALPAKA_CI_GCC_VER}
+export CMAKE_CXX_COMPILER=$(which g++-${ALPAKA_CI_GCC_VER})
+
+# the g++ executalbe is required for compiling boost
+# if it does not exist, create symbolic link to the install g++-${ALPAKA_CI_GCC_VER}
+if ! command -v g++ >/dev/null; then
+    echo_yellow "No g++ executable found."
+    ln -s $(which g++-${ALPAKA_CI_GCC_VER}) $(dirname $(which g++-${ALPAKA_CI_GCC_VER}))/g++
+fi
+
 if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
 then
     travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtsan0
 fi
 
-which "${CXX}"
-${CXX} -v
+which "${CMAKE_CXX_COMPILER}"
+${CMAKE_CXX_COMPILER} -v
diff --git a/alpaka/script/install_hip.sh b/alpaka/script/install_hip.sh
index 3d71095b..b040dcad 100755
--- a/alpaka/script/install_hip.sh
+++ b/alpaka/script/install_hip.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_hip>"
 
 : "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
 : "${ALPAKA_CI_HIP_VERSION?'ALPAKA_CI_HIP_VERSION must be specified'}"
@@ -15,8 +16,11 @@ source ./script/set.sh
 function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; }
 
 if agc-manager -e rocm@${ALPAKA_CI_HIP_VERSION} ; then
+    echo_green "<USE: preinstalled ROCm ${ALPAKA_CI_HIP_VERSION}>"
     export ROCM_PATH=$(agc-manager -b rocm@${ALPAKA_CI_HIP_VERSION})
 else
+    echo_yellow "<INSTALL: ROCm ${ALPAKA_CI_HIP_VERSION}>"
+
     travis_retry apt-get -y --quiet update
     travis_retry apt-get -y --quiet install wget gnupg2
     # AMD container keys are outdated and must be updated
@@ -45,6 +49,15 @@ export HSA_PATH=$ROCM_PATH
 export PATH=${ROCM_PATH}/bin:$PATH
 export PATH=${ROCM_PATH}/llvm/bin:$PATH
 
+# Workaround if clang uses the stdlibc++. The stdlibc++-9 does not support C++20, therefore we install the stdlibc++-11. Clang automatically uses the latest stdlibc++ version.
+if [[ "$(cat /etc/os-release)" =~ "20.04" ]] && [ "${alpaka_CXX_STANDARD}" == "20" ];
+then
+    travis_retry sudo apt install -y --no-install-recommends software-properties-common
+    sudo apt-add-repository ppa:ubuntu-toolchain-r/test -y
+    travis_retry sudo apt update
+    travis_retry sudo apt install -y --no-install-recommends g++-11
+fi
+
 sudo update-alternatives --install /usr/bin/clang clang ${ROCM_PATH}/llvm/bin/clang 50
 sudo update-alternatives --install /usr/bin/clang++ clang++ ${ROCM_PATH}/llvm/bin/clang++ 50
 sudo update-alternatives --install /usr/bin/cc cc ${ROCM_PATH}/llvm/bin/clang 50
@@ -88,3 +101,6 @@ hipconfig
 rocm-smi
 # print newline as previous command does not do this
 echo
+
+# use the clang++ of the HIP SDK as C++ compiler
+export CMAKE_CXX_COMPILER=$(which clang++)
diff --git a/alpaka/script/install_omp.sh b/alpaka/script/install_omp.sh
index da3c0775..1ae0bd49 100755
--- a/alpaka/script/install_omp.sh
+++ b/alpaka/script/install_omp.sh
@@ -4,17 +4,35 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_omp>"
 
 if [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
 then
     # workaround to avoid link issues from python 2 to 3 during libomp dependency installation
+    rm '/usr/local/bin/2to3-3.11' || true
+    rm '/usr/local/bin/idle3.11' || true
+    rm '/usr/local/bin/pydoc3.11' || true
+    rm '/usr/local/bin/python3.11' || true
+    rm '/usr/local/bin/python3.11-config' || true
     rm '/usr/local/bin/2to3-3.12'
     rm '/usr/local/bin/idle3.12'
     rm '/usr/local/bin/pydoc3.12'
     rm '/usr/local/bin/python3.12'
     rm '/usr/local/bin/python3.12-config'
+    rm '/usr/local/bin/2to3' || true
+    rm '/usr/local/bin/idle3' || true
+    rm '/usr/local/bin/pydoc3' || true
+    rm '/usr/local/bin/python3' || true
+    rm '/usr/local/bin/python3-config' || true
+    rm '/usr/local/share/man/man1/python3.1' || true
+    rm '/usr/local/lib/pkgconfig/python3-embed.pc' || true
+    rm '/usr/local/lib/pkgconfig/python3.pc' || true
+    rm '/usr/local/Frameworks/Python.framework/Headers' || true
+    rm '/usr/local/Frameworks/Python.framework/Python' || true
+    rm '/usr/local/Frameworks/Python.framework/Resources' || true
+    rm '/usr/local/Frameworks/Python.framework/Versions/Current'  || true
     brew reinstall --build-from-source --formula ./script/homebrew/${ALPAKA_CI_XCODE_VER}/libomp.rb
 fi
diff --git a/alpaka/script/install_oneapi.sh b/alpaka/script/install_oneapi.sh
index 507799f5..0159d2a1 100755
--- a/alpaka/script/install_oneapi.sh
+++ b/alpaka/script/install_oneapi.sh
@@ -4,15 +4,17 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_oneapi>"
 
-: "${CXX?'CXX must be specified'}"
-
-
-if ! agc-manager -e oneapi
+if agc-manager -e oneapi
 then
+    echo_green "<USE: preinstalled OneAPI ${ALPAKA_CI_ONEAPI_VERSION}>"
+else
+    echo_yellow "<INSTALL: Intel OneAPI ${ALPAKA_CI_ONEAPI_VERSION}>"
+
     # Ref.: https://github.com/rscohn2/oneapi-ci
     # intel-basekit intel-hpckit are too large in size
 
@@ -41,10 +43,20 @@ then
     set +eu
     source /opt/intel/oneapi/setvars.sh
     set -eu
+
+    # Workaround if icpx uses the stdlibc++. The stdlibc++-9 does not support C++20, therefore we install the stdlibc++-11. Clang automatically uses the latest stdlibc++ version.
+    if [[ "$(cat /etc/os-release)" =~ "20.04" ]] && [ "${alpaka_CXX_STANDARD}" == "20" ];
+    then
+        travis_retry sudo apt install -y --no-install-recommends software-properties-common
+        sudo apt-add-repository ppa:ubuntu-toolchain-r/test -y
+        travis_retry sudo apt update
+        travis_retry sudo apt install -y --no-install-recommends g++-11
+    fi
+
+    # path depends on the SDK version
+    export CMAKE_CXX_COMPILER=$(which icpx)
 fi
 
-which "${CXX}"
-${CXX} --version
-which "${CC}"
-${CC} --version
+which "${CMAKE_CXX_COMPILER}"
+${CMAKE_CXX_COMPILER} --version
 sycl-ls
diff --git a/alpaka/script/install_tbb.sh b/alpaka/script/install_tbb.sh
index a051d8b5..64d8b10c 100755
--- a/alpaka/script/install_tbb.sh
+++ b/alpaka/script/install_tbb.sh
@@ -5,14 +5,19 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: install_tbb>"
 
 : "${ALPAKA_CI_TBB_VERSION?'ALPAKA_CI_TBB_VERSION must be specified'}"
 
-if ! agc-manager -e tbb@${ALPAKA_CI_TBB_VERSION}
+if agc-manager -e tbb@${ALPAKA_CI_TBB_VERSION}
 then
+    echo_green "<USE: preinstalled Intel TBB ${ALPAKA_CI_TBB_VERSION}>"
+else
+    echo_yellow "<INSTALL: Intel TBB ${ALPAKA_CI_TBB_VERSION}>"
+
     # Install TBB
     if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
     then
diff --git a/alpaka/script/job_generator/alpaka_filter.py b/alpaka/script/job_generator/alpaka_filter.py
index 71a43c1f..e63aef25 100644
--- a/alpaka/script/job_generator/alpaka_filter.py
+++ b/alpaka/script/job_generator/alpaka_filter.py
@@ -55,4 +55,51 @@ def alpaka_post_filter(row: List) -> bool:
     ):
         return False
 
+    # there is a compiler bug in GCC 11.4 which avoids working with NVCC 11.5
+    if (
+        row_check_name(row, DEVICE_COMPILER, "==", NVCC)
+        and (
+            row_check_version(row, DEVICE_COMPILER, "==", "11.4")
+            or row_check_version(row, DEVICE_COMPILER, "==", "11.5")
+        )
+        and row_check_name(row, HOST_COMPILER, "==", GCC)
+        and row_check_version(row, HOST_COMPILER, "==", "11")
+    ):
+        return False
+
+    # cmake 3.24 and older does not support C++20 for nvcc
+    if (
+        row_check_name(row, DEVICE_COMPILER, "==", NVCC)
+        and row_check_version(row, CXX_STANDARD, ">=", "20")
+        and row_check_version(row, CMAKE, "<", "3.25")
+    ):
+        return False
+
+    # Debug builds with HIP/ROCm 6.2 produce compiler errors
+    if (
+        is_in_row(row, BUILD_TYPE)
+        and row[param_map[BUILD_TYPE]][VERSION] == CMAKE_DEBUG
+        and row_check_name(row, DEVICE_COMPILER, "==", HIPCC)
+        and row_check_version(row, DEVICE_COMPILER, "==", "6.2")
+    ):
+        return False
+
+     # g++-12 is not available on the Ubuntu 20.04 ppa's
+    if (
+        row_check_name(row, HOST_COMPILER, "==", GCC)
+        and row_check_version(row, HOST_COMPILER, "==", "12")
+        and row_check_version(row, UBUNTU, "==", "20.04")
+    ):
+        return False
+
+    # there is a bug with g++-13 and cuda 12.4 on Ubuntu 20.04
+    if (
+        row_check_name(row, DEVICE_COMPILER, "==", NVCC)
+        and row_check_version(row, DEVICE_COMPILER, "==", "12.4")
+        and row_check_name(row, HOST_COMPILER, "==", GCC)
+        and row_check_version(row, HOST_COMPILER, "==", "13")
+        and row_check_version(row, UBUNTU, "==", "20.04")
+    ):
+        return False
+
     return True
diff --git a/alpaka/script/job_generator/custom_job.py b/alpaka/script/job_generator/custom_job.py
index 5e52da9f..bf3ab079 100644
--- a/alpaka/script/job_generator/custom_job.py
+++ b/alpaka/script/job_generator/custom_job.py
@@ -3,16 +3,14 @@
 
 Add custom jobs. For example loaded from a yaml file."""
 
-from genericpath import isfile
-import os, yaml
+import os
+import yaml
 from typing import List, Dict, Callable
 from typeguard import typechecked
 
 
 @typechecked
-def read_jobs_from_folder(
-    path: str, filter: Callable = lambda name: True
-) -> List[Dict[str, Dict]]:
+def read_jobs_from_folder(path: str, filter: Callable = lambda name: True) -> List[Dict[str, Dict]]:
     """Read all job descriptions from the files located in a specific folder.
     The function ignore sub folders.
 
@@ -37,9 +35,7 @@ def read_jobs_from_folder(
         abs_file_path = os.path.join(path, file_name)
         if os.path.isfile(abs_file_path) and filter(file_name):
             with open(abs_file_path, "r", encoding="utf8") as job_yaml:
-                for job_name, job_body in yaml.load(
-                    job_yaml, yaml.loader.SafeLoader
-                ).items():
+                for job_name, job_body in yaml.load(job_yaml, yaml.loader.SafeLoader).items():
                     custom_job_list.append({job_name: job_body})
 
     return custom_job_list
@@ -58,15 +54,11 @@ def add_custom_jobs(job_matrix_yaml: List[Dict[str, Dict]], container_version: f
         RuntimeError: Throw error, if yaml file of custom jobs does not exits.
     """
     # load custom jobs from the folder script/gitlabci
-    script_gitlab_ci_folder = os.path.abspath(
-        os.path.join(os.path.abspath(__file__), "../../gitlabci/")
-    )
+    script_gitlab_ci_folder = os.path.abspath(os.path.join(os.path.abspath(__file__), "../../gitlabci/"))
 
     for path in [script_gitlab_ci_folder]:
         job_matrix_yaml += read_jobs_from_folder(
             path,
             # filter file names
-            lambda name: name != "job_base.yml"
-            and name.startswith("job_")
-            and name.endswith(".yml"),
+            lambda name: name != "job_base.yml" and name.startswith("job_") and name.endswith(".yml"),
         )
diff --git a/alpaka/script/job_generator/generate_job_yaml.py b/alpaka/script/job_generator/generate_job_yaml.py
index 01d6cd23..e66191e8 100644
--- a/alpaka/script/job_generator/generate_job_yaml.py
+++ b/alpaka/script/job_generator/generate_job_yaml.py
@@ -5,7 +5,8 @@
 
 from typing import List, Dict, Tuple
 from typeguard import typechecked
-import os, yaml
+import os
+import yaml
 import gitlab
 
 
@@ -136,18 +137,12 @@ def job_image(
     """
 
     verified_container_url = [
-        "registry.hzdr.de/crp/alpaka-group-container/"
-        + "alpaka-ci-ubuntu"
-        + job[UBUNTU][VERSION]
+        "registry.hzdr.de/crp/alpaka-group-container/" + "alpaka-ci-ubuntu" + job[UBUNTU][VERSION]
     ]
 
-    is_in_gitlab_images = lambda name: bool(
-        [i for i in gitlab_images if i.startswith(name)]
-    )
+    is_in_gitlab_images = lambda name: bool([i for i in gitlab_images if i.startswith(name)])
 
-    def verify_image(
-        test_url: List[str], verified_url: List[str], gitlab_images: List[str]
-    ) -> bool:
+    def verify_image(test_url: List[str], verified_url: List[str], gitlab_images: List[str]) -> bool:
         """Verify if the test_url is included in gitlab_images.
 
         Args:
@@ -165,8 +160,7 @@ def verify_image(
         if gitlab_images and not is_in_gitlab_images("".join(test_url)):
             if "".join(test_url) not in image_warning_cache:
                 print_warn(
-                    f'image {"".join(test_url)} does not exist\n'
-                    f'  use instead image: {"".join(verified_url)}'
+                    f'image {"".join(test_url)} does not exist\n' f'  use instead image: {"".join(verified_url)}'
                 )
                 # append image to a cache to show the warning only one time
                 image_warning_cache.append("".join(test_url))
@@ -188,26 +182,16 @@ def verify_image(
         return "".join(verified_container_url)
     verified_container_url = testing_container_url.copy()
 
-    if (
-        ALPAKA_ACC_GPU_CUDA_ENABLE in job
-        and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER
-    ):
+    if ALPAKA_ACC_GPU_CUDA_ENABLE in job and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER:
         # Cast cuda version shape. E.g. from 11.0 to 110
-        testing_container_url.insert(
-            1, "-cuda" + str(int(float(job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION]) * 10))
-        )
+        testing_container_url.insert(1, "-cuda" + str(int(float(job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION]) * 10)))
 
     if not verify_image(testing_container_url, verified_container_url, gitlab_images):
         return "".join(verified_container_url)
     verified_container_url = testing_container_url.copy()
 
-    if (
-        ALPAKA_ACC_GPU_HIP_ENABLE in job
-        and job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION] != OFF_VER
-    ):
-        testing_container_url.insert(
-            1, "-rocm" + job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION]
-        )
+    if ALPAKA_ACC_GPU_HIP_ENABLE in job and job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION] != OFF_VER:
+        testing_container_url.insert(1, "-rocm" + job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION])
 
     if not verify_image(testing_container_url, verified_container_url, gitlab_images):
         return "".join(verified_container_url)
@@ -220,9 +204,7 @@ def verify_image(
 
 
 @typechecked
-def append_backend_variables(
-    variables: Dict[str, str], job: Dict[str, Tuple[str, str]]
-):
+def append_backend_variables(variables: Dict[str, str], job: Dict[str, Tuple[str, str]]):
     """Searches for enabled back-ends in the job parameters and appends the back-end
     variable to variables to enable it in the CI job.
 
@@ -287,6 +269,7 @@ def job_variables(job: Dict[str, Tuple[str, str]]) -> Dict[str, str]:
 
     variables["ALPAKA_CI_CMAKE_VER"] = job[CMAKE][VERSION]
     variables["ALPAKA_BOOST_VERSION"] = job[BOOST][VERSION]
+    variables["alpaka_CXX_STANDARD"] = job[CXX_STANDARD][VERSION]
 
     # all back-ends are disabled by default
     # back-ends are conditionally enabled depending on the job parameters
@@ -313,28 +296,19 @@ def job_variables(job: Dict[str, Tuple[str, str]]) -> Dict[str, str]:
     append_backend_variables(variables, job)
 
     if job[DEVICE_COMPILER][NAME] == GCC:
-        variables["CC"] = "gcc"
-        variables["CXX"] = "g++"
+        variables["ALPAKA_CI_CXX"] = "g++"
         variables["ALPAKA_CI_GCC_VER"] = job[DEVICE_COMPILER][VERSION]
-        if (
-            ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE in job
-            and job[ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE][VERSION] == ON_VER
-        ):
+        if ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE in job and job[ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE][VERSION] == ON_VER:
             variables["ALPAKA_CI_TBB_VERSION"] = "2021.10.0"
 
     if job[DEVICE_COMPILER][NAME] == CLANG:
-        variables["CC"] = "clang"
-        variables["CXX"] = "clang++"
+        variables["ALPAKA_CI_CXX"] = "clang++"
         variables["ALPAKA_CI_CLANG_VER"] = job[DEVICE_COMPILER][VERSION]
-        if (
-            ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE in job
-            and job[ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE][VERSION] == ON_VER
-        ):
+        if ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE in job and job[ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE][VERSION] == ON_VER:
             variables["ALPAKA_CI_TBB_VERSION"] = "2021.10.0"
 
     if job[DEVICE_COMPILER][NAME] == HIPCC:
-        variables["CC"] = "clang"
-        variables["CXX"] = "clang++"
+        variables["ALPAKA_CI_CXX"] = "clang++"
         variables["CMAKE_HIP_COMPILER"] = "clang++"
         variables["CMAKE_HIP_ARCHITECTURES"] = "${CI_GPU_ARCH}"
         # TODO(SimeonEhrig) check, if we can remove this variable:
@@ -354,58 +328,55 @@ def job_variables(job: Dict[str, Tuple[str, str]]) -> Dict[str, str]:
             variables["ALPAKA_CI_CLANG_VER"] = "17"
         elif job[DEVICE_COMPILER][VERSION] == "6.0":
             variables["ALPAKA_CI_CLANG_VER"] = "17"
+        elif job[DEVICE_COMPILER][VERSION] == "6.1":
+            variables["ALPAKA_CI_CLANG_VER"] = "17"
+        elif job[DEVICE_COMPILER][VERSION] == "6.2":
+            variables["ALPAKA_CI_CLANG_VER"] = "18"
         else:
             raise RuntimeError(
-                "generate_job_yaml.job_variables(): unknown hip version: "
-                f"{job[DEVICE_COMPILER][VERSION]}"
+                "generate_job_yaml.job_variables(): unknown ROCm version: " f"{job[DEVICE_COMPILER][VERSION]}"
             )
         variables["ALPAKA_CI_HIP_VERSION"] = job[DEVICE_COMPILER][VERSION]
         variables["ALPAKA_CI_STDLIB"] = "libstdc++"
 
     # general configuration, if the CUDA backend is enabled (includes nvcc and clang as CUDA
     # compiler)
-    if (
-        ALPAKA_ACC_GPU_CUDA_ENABLE in job
-        and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER
-    ):
+    if ALPAKA_ACC_GPU_CUDA_ENABLE in job and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER:
         variables["ALPAKA_CI_STDLIB"] = "libstdc++"
         variables["CMAKE_CUDA_ARCHITECTURES"] = job[SM_LEVEL][VERSION]
         variables["ALPAKA_CI_CUDA_VERSION"] = job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION]
 
     if job[DEVICE_COMPILER][NAME] == NVCC:
         # general configuration, if nvcc is the CUDA compiler
-        variables["CMAKE_CUDA_COMPILER"] = "nvcc"
+        variables["ALPAKA_CI_CUDA_COMPILER"] = "nvcc"
 
         # configuration, if GCC is the CUDA host compiler
         if job[HOST_COMPILER][NAME] == GCC:
-            variables["CC"] = "gcc"
-            variables["CXX"] = "g++"
+            variables["ALPAKA_CI_CXX"] = "g++"
             variables["ALPAKA_CI_GCC_VER"] = job[HOST_COMPILER][VERSION]
         # configuration, if Clang is the CUDA host compiler
         elif job[HOST_COMPILER][NAME] == CLANG:
-            variables["CC"] = "clang"
-            variables["CXX"] = "clang++"
+            variables["ALPAKA_CI_CXX"] = "clang++"
             variables["ALPAKA_CI_CLANG_VER"] = job[HOST_COMPILER][VERSION]
         else:
             raise RuntimeError(
-                "generate_job_yaml.job_variables(): unknown CUDA host compiler: "
-                f"{job[HOST_COMPILER][NAME]}"
+                "generate_job_yaml.job_variables(): unknown CUDA host compiler: " f"{job[HOST_COMPILER][NAME]}"
             )
 
     if job[DEVICE_COMPILER][NAME] == CLANG_CUDA:
-        variables["CC"] = "clang"
-        variables["CXX"] = "clang++"
+        variables["ALPAKA_CI_CXX"] = "clang++"
         variables["ALPAKA_CI_CLANG_VER"] = job[DEVICE_COMPILER][VERSION]
-        variables["CMAKE_CUDA_COMPILER"] = "clang++"
+        variables["ALPAKA_CI_CUDA_COMPILER"] = "clang++"
 
     # oneAPI configuration
     if job[DEVICE_COMPILER][NAME] == ICPX:
-        variables["CC"] = "icx"
-        variables["CXX"] = "icpx"
-        if job[DEVICE_COMPILER][VERSION] == "2023.1.0":
-            variables["ALPAKA_CI_CLANG_VER"] = "16"
-        elif job[DEVICE_COMPILER][VERSION] == "2023.2.0":
-            variables["ALPAKA_CI_CLANG_VER"] = "16"
+        variables["ALPAKA_CI_CXX"] = "icpx"
+        if job[DEVICE_COMPILER][VERSION] == "2024.0":
+            variables["ALPAKA_CI_CLANG_VER"] = "17"
+        elif job[DEVICE_COMPILER][VERSION] == "2024.1":
+            variables["ALPAKA_CI_CLANG_VER"] = "18"
+        elif job[DEVICE_COMPILER][VERSION] == "2024.2":
+            variables["ALPAKA_CI_CLANG_VER"] = "19"
         variables["ALPAKA_CI_STDLIB"] = "libstdc++"
         variables["ALPAKA_CI_ONEAPI_VERSION"] = job[DEVICE_COMPILER][VERSION]
         variables["alpaka_SYCL_ONEAPI_CPU"] = "ON"
@@ -427,21 +398,12 @@ def job_tags(job: Dict[str, Tuple[str, str]]) -> List[str]:
     if job[JOB_EXECUTION_TYPE][VERSION] == JOB_EXECUTION_COMPILE_ONLY:
         return ["x86_64", "cpuonly"]
 
-    if (
-        ALPAKA_ACC_GPU_CUDA_ENABLE in job
-        and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER
-    ):
+    if ALPAKA_ACC_GPU_CUDA_ENABLE in job and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER:
         return ["x86_64", "cuda"]
-    if (
-        ALPAKA_ACC_GPU_HIP_ENABLE in job
-        and job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION] != OFF_VER
-    ):
+    if ALPAKA_ACC_GPU_HIP_ENABLE in job and job[ALPAKA_ACC_GPU_HIP_ENABLE][VERSION] != OFF_VER:
         return ["x86_64", "rocm"]
 
-    if (
-        ALPAKA_ACC_SYCL_ENABLE in job
-        and job[ALPAKA_ACC_SYCL_ENABLE][VERSION] != OFF_VER
-    ):
+    if ALPAKA_ACC_SYCL_ENABLE in job and job[ALPAKA_ACC_SYCL_ENABLE][VERSION] != OFF_VER:
         return ["x86_64", "cpuonly"]
 
     # fallback
@@ -487,9 +449,7 @@ def global_variables() -> Dict[str, str]:
 
 
 @typechecked
-def create_job(
-    job: Dict[str, Tuple[str, str]], container_version: float, gitlab_images: List[str]
-) -> Dict[str, Dict]:
+def create_job(job: Dict[str, Tuple[str, str]], container_version: float, gitlab_images: List[str]) -> Dict[str, Dict]:
     """Create complete GitLab-CI yaml for a single job
 
     Args:
@@ -505,9 +465,7 @@ def create_job(
     job_name = "linux_" + job[DEVICE_COMPILER][NAME] + job[DEVICE_COMPILER][VERSION]
     # if the nvcc is the device compiler, add also the host compiler to the name
     if job[DEVICE_COMPILER][NAME] == NVCC:
-        job_name = (
-            job_name + "-" + job[HOST_COMPILER][NAME] + job[HOST_COMPILER][VERSION]
-        )
+        job_name = job_name + "-" + job[HOST_COMPILER][NAME] + job[HOST_COMPILER][VERSION]
     # if Clang-CUDA is the device compiler, add also the CUDA SDK version to the name
     if job[DEVICE_COMPILER][NAME] == CLANG_CUDA:
         job_name = job_name + "-cuda" + job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION]
@@ -581,9 +539,7 @@ def distribute_to_waves(
         elif job_name.startswith("linux_gcc"):
             sorted_groups[JOB_CPU_RUNTIME].append(job)
         # Clang as C++ compiler without CUDA backend
-        elif job_name.startswith("linux_clang") and not job_name.startswith(
-            "linux_clang-cuda"
-        ):
+        elif job_name.startswith("linux_clang") and not job_name.startswith("linux_clang-cuda"):
             sorted_groups[JOB_CPU_RUNTIME].append(job)
         elif job_name.startswith("linux_hipcc"):
             # sorted_groups[JOB_ROCM_RUNTIME].append(job)
@@ -604,7 +560,7 @@ def distribute_to_waves(
             sorted_groups[JOB_UNKNOWN].append(job)
 
     for wave in WAVE_GROUP_NAMES:
-        if not wave in wave_size:
+        if wave not in wave_size:
             wave_size[wave] = len(sorted_groups[wave])
         else:
             # if max_jobs is negative, set to 0
@@ -653,9 +609,7 @@ def write_job_yaml(
                     "dummy-job": {
                         "image": "alpine:latest",
                         "interruptible": True,
-                        "script": [
-                            'echo "This is a dummy job so that the CI does not fail."'
-                        ],
+                        "script": ['echo "This is a dummy job so that the CI does not fail."'],
                     }
                 },
                 output_file,
@@ -679,9 +633,7 @@ def write_job_yaml(
         # The CUDA and HIP jobs inherent from a job template written in yaml
         script_path = os.path.abspath(__file__)
         with open(
-            os.path.abspath(
-                os.path.join(os.path.dirname(script_path), "../gitlabci/job_base.yml")
-            ),
+            os.path.abspath(os.path.join(os.path.dirname(script_path), "../gitlabci/job_base.yml")),
             "r",
             encoding="utf8",
         ) as file:
@@ -693,14 +645,10 @@ def write_job_yaml(
             # If all jobs would be collected first in dict, the order would be not guarantied.
             for stage_number, wave in enumerate(job_matrix[wave_name]):
                 # Improve the readability of the generated job yaml
-                output_file.write(
-                    f"# <<<<<<<<<<<<< {wave_name}-stage{stage_number} >>>>>>>>>>>>>\n\n"
-                )
+                output_file.write(f"# <<<<<<<<<<<<< {wave_name}-stage{stage_number} >>>>>>>>>>>>>\n\n")
                 for job in wave:
                     # the first key is the name
-                    job[list(job.keys())[0]][
-                        "stage"
-                    ] = f"{wave_name}-stage{stage_number}"
+                    job[list(job.keys())[0]]["stage"] = f"{wave_name}-stage{stage_number}"
 
                     yaml.dump(job, output_file)
                     output_file.write("\n")
diff --git a/alpaka/script/job_generator/job_generator.py b/alpaka/script/job_generator/job_generator.py
index c9ba6633..a3b18637 100644
--- a/alpaka/script/job_generator/job_generator.py
+++ b/alpaka/script/job_generator/job_generator.py
@@ -4,7 +4,9 @@
 Generate GitLab-CI test jobs yaml for the vikunja CI."""
 
 import argparse
-import sys, os, random
+import sys
+import os
+import random
 from typing import List, Dict, Tuple
 from collections import OrderedDict
 
@@ -26,7 +28,6 @@
     write_job_yaml,
     distribute_to_waves,
     JOB_COMPILE_ONLY,
-    JOB_RUNTIME,
     JOB_UNKNOWN,
     WAVE_GROUP_NAMES,
 )
@@ -41,21 +42,15 @@ def get_args() -> argparse.Namespace:
     Returns:
         argparse.Namespace: The commandline arguments.
     """
-    parser = argparse.ArgumentParser(
-        description="Calculate job matrix and create GitLab CI .yml."
-    )
+    parser = argparse.ArgumentParser(description="Calculate job matrix and create GitLab CI .yml.")
 
-    parser.add_argument(
-        "version", type=float, help="Version number of the used CI container."
-    )
+    parser.add_argument("version", type=float, help="Version number of the used CI container.")
     parser.add_argument(
         "--print-combinations",
         action="store_true",
         help="Display combination matrix.",
     )
-    parser.add_argument(
-        "--verify", action="store_true", help="Verify generated combination matrix"
-    )
+    parser.add_argument("--verify", action="store_true", help="Verify generated combination matrix")
     parser.add_argument(
         "-a",
         "--all",
@@ -174,9 +169,7 @@ def get_args() -> argparse.Namespace:
             if striped_line.strip().startswith(COMMIT_MESSAGE_FILTER_PREFIX):
                 filter_regix = striped_line[len(COMMIT_MESSAGE_FILTER_PREFIX) :].strip()
             if striped_line.startswith(COMMIT_MESSAGE_REORDER_PREFIX):
-                reorder_regix = striped_line[
-                    len(COMMIT_MESSAGE_REORDER_PREFIX) :
-                ].strip()
+                reorder_regix = striped_line[len(COMMIT_MESSAGE_REORDER_PREFIX) :].strip()
 
     if filter_regix:
         job_matrix_yaml = filter_job_list(job_matrix_yaml, filter_regix)
@@ -197,7 +190,7 @@ def get_args() -> argparse.Namespace:
         filter_wave_name = args.wave
         wave_job_matrix = {filter_wave_name: wave_job_matrix[filter_wave_name]}
         for wave_name in WAVE_GROUP_NAMES:
-            if not wave_name in wave_job_matrix:
+            if wave_name not in wave_job_matrix:
                 wave_job_matrix[wave_name] = []
 
     write_job_yaml(
diff --git a/alpaka/script/job_generator/job_modifier.py b/alpaka/script/job_generator/job_modifier.py
index 7a4310f2..ebac6017 100644
--- a/alpaka/script/job_generator/job_modifier.py
+++ b/alpaka/script/job_generator/job_modifier.py
@@ -48,13 +48,13 @@ def add_job_parameters(job_matrix: List[Dict[str, Tuple[str, str]]]):
 
     # This is a helper dictionary to find the latest minor version of a CUDA SDK major, used by a
     # specific host compiler.
-    # e.g. We have CUDA SDK versions from 11.0 to 11.8 and 12.0 and 12.1. GCC as host compiler
+    # e.g. We have CUDA SDK versions from 11.2 to 11.8 and 12.0 and 12.1. GCC as host compiler
     # supports all SDK versions, Clang as host compiler only the 11 versions and Clang as CUDA
     # compiler only up to 11.5. So the result is (see CUDA_SDK_per_compiler later):
     # {"GCC" : ["11.8", "12.1"], "Clang" : ["11.8"], "Clang-CUDA" : ["11.5"]}
 
     # { compiler_name : {major : (minor, (version_string)}}
-    latest_CUDA_SDK_minor_versions: Dict[str : Dict[int:(int, str)]] = {
+    latest_CUDA_SDK_minor_versions: Dict[str : Dict[int : (int, str)]] = {
         GCC: {},
         CLANG: {},
         CLANG_CUDA: {},
@@ -64,22 +64,14 @@ def add_job_parameters(job_matrix: List[Dict[str, Tuple[str, str]]]):
     VERSION_STRING = 1
 
     for job in job_matrix:
-        if (
-            ALPAKA_ACC_GPU_CUDA_ENABLE in job
-            and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER
-        ):
+        if ALPAKA_ACC_GPU_CUDA_ENABLE in job and job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] != OFF_VER:
             v = version.parse(job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION])
-            if not v.major in latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]]:
+            if v.major not in latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]]:
                 latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]][v.major] = (
                     v.minor,
                     job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION],
                 )
-            elif (
-                latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]][v.major][
-                    MINOR_VERSION
-                ]
-                < v.minor
-            ):
+            elif latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]][v.major][MINOR_VERSION] < v.minor:
                 latest_CUDA_SDK_minor_versions[job[HOST_COMPILER][NAME]][v.major] = (
                     v.minor,
                     job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION],
@@ -100,22 +92,16 @@ def add_job_parameters(job_matrix: List[Dict[str, Tuple[str, str]]]):
 
     for job in job_matrix:
         for compiler_name, sdk_versions in CUDA_SDK_per_compiler.items():
-            if (
-                ALPAKA_ACC_GPU_CUDA_ENABLE in job
-                and job[HOST_COMPILER][NAME] == compiler_name
-            ):
+            if ALPAKA_ACC_GPU_CUDA_ENABLE in job and job[HOST_COMPILER][NAME] == compiler_name:
                 for sdk_version in sdk_versions:
                     if (
                         job[ALPAKA_ACC_GPU_CUDA_ENABLE][VERSION] == sdk_version
                         # needs to be a release build, otherwise there is the risk of running ot of
                         # GPU resources
                         and job[BUILD_TYPE][VERSION] == CMAKE_RELEASE
-                        and not job[HOST_COMPILER][VERSION]
-                        in used_host_compiler[job[HOST_COMPILER][NAME]]
+                        and job[HOST_COMPILER][VERSION] not in used_host_compiler[job[HOST_COMPILER][NAME]]
                     ):
-                        used_host_compiler[job[HOST_COMPILER][NAME]].append(
-                            job[HOST_COMPILER][VERSION]
-                        )
+                        used_host_compiler[job[HOST_COMPILER][NAME]].append(job[HOST_COMPILER][VERSION])
                         job[JOB_EXECUTION_TYPE] = (
                             JOB_EXECUTION_TYPE,
                             JOB_EXECUTION_RUNTIME,
@@ -140,10 +126,7 @@ def add_job_parameters(job_matrix: List[Dict[str, Tuple[str, str]]]):
             else:
                 job[SM_LEVEL] = (SM_LEVEL, STANDARD_SM_LEVEL + ";90")
             missing_nvcc_versions.remove(job[DEVICE_COMPILER][VERSION])
-        elif (
-            ALPAKA_ACC_GPU_CUDA_ENABLE in job
-            and ALPAKA_ACC_GPU_CUDA_ENABLE[VERSION] != OFF_VER
-        ):
+        elif ALPAKA_ACC_GPU_CUDA_ENABLE in job and ALPAKA_ACC_GPU_CUDA_ENABLE[VERSION] != OFF_VER:
             job[SM_LEVEL] = (SM_LEVEL, STANDARD_SM_LEVEL)
         else:
             job[SM_LEVEL] = (SM_LEVEL, "")
diff --git a/alpaka/script/job_generator/reorder_jobs.py b/alpaka/script/job_generator/reorder_jobs.py
index 27e997cd..1bd7d1ac 100644
--- a/alpaka/script/job_generator/reorder_jobs.py
+++ b/alpaka/script/job_generator/reorder_jobs.py
@@ -9,8 +9,6 @@
 from typeguard import typechecked
 
 from alpaka_job_coverage.globals import *  # pylint: disable=wildcard-import,unused-wildcard-import
-from alpaka_job_coverage.util import search_and_move_job
-from versions import sw_versions
 
 
 @typechecked
diff --git a/alpaka/script/job_generator/verify.py b/alpaka/script/job_generator/verify.py
index 7871d69f..29c5cceb 100644
--- a/alpaka/script/job_generator/verify.py
+++ b/alpaka/script/job_generator/verify.py
@@ -14,9 +14,7 @@
 from util import print_warn
 
 
-def verify_parameters(
-    parameters: Dict[str, Union[List[Tuple[str, str]], List[List[Tuple[str, str]]]]]
-):
+def verify_parameters(parameters: Dict[str, Union[List[Tuple[str, str]], List[List[Tuple[str, str]]]]]):
     """Prints a warning for each parameter value which is not supported by the
     alpaka-job-coverage library.
 
@@ -29,10 +27,7 @@ def verify_parameters(
             for backend in param_value:
                 for name, version in backend:
                     if not is_supported_version(name=name, version=version):
-                        print_warn(
-                            f"{name}-{version} is not officially supported by "
-                            "the alpaka-job-library."
-                        )
+                        print_warn(f"{name}-{version} is not officially supported by the alpaka-job-library.")
         elif param_name not in [BUILD_TYPE, JOB_EXECUTION_TYPE, MDSPAN]:
             for name, version in param_value:
                 # if we compare a minor.major.patch version with a minor.major
@@ -41,16 +36,11 @@ def verify_parameters(
                 #      2.4 == 2.4.1 -> False
                 if name == CMAKE:
                     parsed_cmake_version = pk_version.parse(version)
-                    mod_version = (
-                        f"{parsed_cmake_version.major}.{parsed_cmake_version.minor}"
-                    )
+                    mod_version = f"{parsed_cmake_version.major}.{parsed_cmake_version.minor}"
                 else:
                     mod_version = version
                 if not is_supported_version(name=name, version=mod_version):
-                    print_warn(
-                        f"{name}-{mod_version} is not officially supported by "
-                        "the alpaka-job-library."
-                    )
+                    print_warn(f"{name}-{mod_version} is not officially supported by the alpaka-job-library.")
 
 
 class Combination:
@@ -121,15 +111,9 @@ def verify(combinations: List[Dict[str, Tuple[str, str]]]) -> bool:
         Combination({HOST_COMPILER: (GCC, "*"), DEVICE_COMPILER: (NVCC, "*")}),
         Combination({HOST_COMPILER: (CLANG, "*"), DEVICE_COMPILER: (NVCC, "*")}),
         Combination({HOST_COMPILER: (HIPCC, "*"), DEVICE_COMPILER: (HIPCC, "*")}),
-        Combination(
-            {HOST_COMPILER: (CLANG_CUDA, "*"), DEVICE_COMPILER: (CLANG_CUDA, "*")}
-        ),
-        Combination(
-            {DEVICE_COMPILER: (NVCC, "12.0"), CXX_STANDARD: (CXX_STANDARD, "20")}
-        ),
-        Combination(
-            {DEVICE_COMPILER: (NVCC, "12.1"), CXX_STANDARD: (CXX_STANDARD, "20")}
-        ),
+        Combination({HOST_COMPILER: (CLANG_CUDA, "*"), DEVICE_COMPILER: (CLANG_CUDA, "*")}),
+        Combination({DEVICE_COMPILER: (NVCC, "12.0"), CXX_STANDARD: (CXX_STANDARD, "20")}),
+        Combination({DEVICE_COMPILER: (NVCC, "12.1"), CXX_STANDARD: (CXX_STANDARD, "20")}),
         Combination({HOST_COMPILER: (ICPX, "*"), DEVICE_COMPILER: (ICPX, "*")}),
     ]
 
diff --git a/alpaka/script/job_generator/versions.py b/alpaka/script/job_generator/versions.py
index b3826294..e7f186e3 100644
--- a/alpaka/script/job_generator/versions.py
+++ b/alpaka/script/job_generator/versions.py
@@ -14,8 +14,6 @@
     GCC: ["9", "10", "11", "12", "13"],
     CLANG: ["9", "10", "11", "12", "13", "14", "15", "16", "17"],
     NVCC: [
-        "11.0",
-        "11.1",
         "11.2",
         "11.3",
         "11.4",
@@ -27,9 +25,12 @@
         "12.1",
         "12.2",
         "12.3",
+        "12.4",
+        "12.5",
+        "12.6",
     ],
-    HIPCC: ["5.1", "5.2", "5.3", "5.4", "5.5", "5.6", "5.7", "6.0"],
-    ICPX: ["2023.1.0", "2023.2.0"],
+    HIPCC: ["5.1", "5.2", "5.3", "5.4", "5.5", "5.6", "5.7", "6.0", "6.1", "6.2"],
+    ICPX: ["2024.2"],
     # Contains all enabled back-ends.
     # There are special cases for ALPAKA_ACC_GPU_CUDA_ENABLE and ALPAKA_ACC_GPU_HIP_ENABLE
     # which have to be combined with nvcc and hipcc versions.
@@ -74,7 +75,7 @@
         ],
     ],
     UBUNTU: ["20.04"],
-    CMAKE: ["3.22.6", "3.23.5", "3.24.4", "3.25.3", "3.26.4"],
+    CMAKE: ["3.22.6", "3.23.5", "3.24.4", "3.25.3", "3.26.4", "3.27.9", "3.28.6", "3.29.8", "3.30.3"],
     BOOST: [
         "1.74.0",
         "1.75.0",
@@ -85,6 +86,10 @@
         "1.80.0",
         "1.81.0",
         "1.82.0",
+        "1.83.0",
+        "1.84.0",
+        "1.85.0",
+        "1.86.0",
     ],
     CXX_STANDARD: ["17", "20"],
     BUILD_TYPE: BUILD_TYPES,
@@ -109,9 +114,7 @@ def get_compiler_versions(clang_cuda: bool = True) -> List[Tuple[str, str]]:
     compilers: List[Tuple[str, str]] = []
 
     # only use keys defined in sw_versions
-    for compiler_name in set(sw_versions.keys()).intersection(
-        [GCC, CLANG, NVCC, HIPCC, ICPX]
-    ):
+    for compiler_name in set(sw_versions.keys()).intersection([GCC, CLANG, NVCC, HIPCC, ICPX]):
         for version in sw_versions[compiler_name]:
             compilers.append((compiler_name, version))
             if clang_cuda and compiler_name == CLANG:
diff --git a/alpaka/script/prepare_sanitizers.sh b/alpaka/script/prepare_sanitizers.sh
index 60991d33..c18444b8 100755
--- a/alpaka/script/prepare_sanitizers.sh
+++ b/alpaka/script/prepare_sanitizers.sh
@@ -5,7 +5,17 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: prepare_sanitizers>"
+
+#-------------------------------------------------------------------------------
+# Configure the leak sanitizer to gnore memory leaks in the clang OpenMP library
+cat > lsan.supp <<@EOF
+leak:libomp.so
+@EOF
+LSAN_SUPPRESSIONS="suppressions=$PWD/lsan.supp"
 
 #-------------------------------------------------------------------------------
 # Exports the CMAKE_CXX_FLAGS and CMAKE_EXE_LINKER_FLAGS to enable the sanitizers listed in ALPAKA_CI_SANITIZERS.
@@ -23,7 +33,9 @@ then
 fi
 if [ -z "${LSAN_OPTIONS+x}" ]
 then
-    export LSAN_OPTIONS=
+    export LSAN_OPTIONS="$LSAN_SUPPRESSIONS"
+else
+    export LSAN_OPTIONS="$LSAN_OPTIONS,$LSAN_SUPPRESSIONS"
 fi
 
 #-------------------------------------------------------------------------------
@@ -37,7 +49,7 @@ then
     CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-optimize-sibling-calls"
 
     # g++ needs to use a different linker
-    if [[ "${CXX}" == "g++"* ]]
+    if [[ "${ALPAKA_CI_CXX}" == "g++"* ]]
     then
         CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold"
     fi
@@ -47,7 +59,7 @@ then
     then
         CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=undefined"
 
-        if [[ "${CXX}" == "clang++"* ]]
+        if [[ "${ALPAKA_CI_CXX}" == "clang++"* ]]
         then
             # Previously 'local-bounds' was part of UBsan but has been removed because it is not a pure front-end check
             CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=local-bounds"
@@ -65,7 +77,7 @@ then
             exit 1
         fi
 
-        if ( [ "${alpaka_ACC_GPU_CUDA_ENABLE}" == "ON" ] && [ "${CMAKE_CUDA_COMPILER}" == "clang++" ] )
+        if ( [ "${alpaka_ACC_GPU_CUDA_ENABLE}" == "ON" ] && [ "${ALPAKA_CI_CUDA_COMPILER}" == "clang++" ] )
         then
             # fatal error: error in backend: Module has a nontrivial global ctor, which NVPTX does not support.
             # clang-3.9: error: clang frontend command failed with exit code 70 (use -v to see invocation)
@@ -75,7 +87,7 @@ then
 
         CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=address"
 
-        if [[ "${CXX}" != "clang++"* ]]
+        if [[ "${ALPAKA_CI_CXX}" != "clang++"* ]]
         then
             CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-address-use-after-scope"
         fi
@@ -96,7 +108,7 @@ then
         fi
 
         CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=thread"
-        if [[ "${CXX}" == "g++"* ]]
+        if [[ "${ALPAKA_CI_CXX}" == "g++"* ]]
         then
             CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -pie -fPIE"
             CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -ltsan"
diff --git a/alpaka/script/print_env.sh b/alpaka/script/print_env.sh
index a2288f18..dfd9f146 100755
--- a/alpaka/script/print_env.sh
+++ b/alpaka/script/print_env.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: print_env>"
 
 #-------------------------------------------------------------------------------
 if [ "$alpaka_CI" = "GITHUB" ]
diff --git a/alpaka/script/push_doc.sh b/alpaka/script/push_doc.sh
index 506e17c1..bef8e41d 100755
--- a/alpaka/script/push_doc.sh
+++ b/alpaka/script/push_doc.sh
@@ -5,9 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/travis_retry.sh
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: push_doc>"
 
 cd docs/doxygen/html
 
diff --git a/alpaka/script/readme_generator/README.md b/alpaka/script/readme_generator/README.md
new file mode 100644
index 00000000..9a755074
--- /dev/null
+++ b/alpaka/script/readme_generator/README.md
@@ -0,0 +1,13 @@
+# About
+
+The script generates the Markdown table for compiler compatibility for back-ends for the alpaka `README.md`. It reads the properties from the `supported_compiler.json` and outputs the Markdown table to stdout.
+
+```bash
+./generate_supported_compilers.py
+```
+
+The generated Markdown can be copied to the alpaka `README.md`.
+
+# Configuration File
+
+The configuration file contains a dictionary. Each key in the dictionary is a compiler. The values contain information about the compatibility with the back-ends. The names of the back-ends are specified by the script. Each back-end requires a `state` property. The `comment` property is optional.
diff --git a/alpaka/script/readme_generator/generate_supported_compilers.py b/alpaka/script/readme_generator/generate_supported_compilers.py
new file mode 100755
index 00000000..7ebeed01
--- /dev/null
+++ b/alpaka/script/readme_generator/generate_supported_compilers.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+from typing import List, Dict
+from dataclasses import dataclass
+
+
+def print_red(msg: str):
+    """print message with color red"""
+    print("\033[0;31m" + msg + "\033[0m")
+
+
+@dataclass
+class ConfigEntry:
+    """Stores the key for the json config and how the key is printed in
+    markdown.
+    """
+
+    name: str
+    representation: str
+
+
+# pylint: disable=missing-docstring
+def get_expected_config_entries() -> List[ConfigEntry]:
+    return [
+        ConfigEntry("serial", "Serial"),
+        ConfigEntry("OMPblock", "OpenMP 2.0+ blocks"),
+        ConfigEntry("OMPthread", "OpenMP 2.0+ threads"),
+        ConfigEntry("thread", "std::thread"),
+        ConfigEntry("tbb", "TBB"),
+        ConfigEntry("CUDAnvcc", "CUDA (nvcc)"),
+        ConfigEntry("CUDAclang", "CUDA (clang)"),
+        ConfigEntry("hip", "HIP (clang)"),
+        ConfigEntry("sycl", "SYCL"),
+    ]
+
+
+# pylint: disable=missing-docstring
+def get_expected_config_names() -> List[str]:
+    return [n.name for n in get_expected_config_entries()]
+
+
+def get_known_states() -> Dict[str, str]:
+    """Returns a dict of known backend states. The key is the value in the
+    config json and the value how it will be printed in markdown.
+    """
+    return {"yes": ":white_check_mark:", "no": ":x:", "none": "-"}
+
+
+# pylint: disable=missing-docstring
+def get_known_state_names() -> List[str]:
+    return list(get_known_states().keys())
+
+
+def config_validator(conf: Dict[str, Dict[str, str]]) -> bool:
+    """Validate the json configuration and prints errors."""
+    for compiler_name, compiler_conf in conf.items():
+        for expected_entry in get_expected_config_names():
+            if expected_entry not in compiler_conf:
+                print_red(f"[ERROR]: {compiler_name} misses entry {expected_entry}")
+                return False
+            if "state" not in compiler_conf[expected_entry]:
+                print_red(
+                    f"[ERROR]: {compiler_name}/{expected_entry} misses state entry"
+                )
+                return False
+            if compiler_conf[expected_entry]["state"] not in get_known_state_names():
+                print_red(
+                    f"[ERROR]: {compiler_name}/{expected_entry}/state "
+                    f"unknown state: {compiler_conf[expected_entry]['state']}"
+                )
+                return False
+
+    return True
+
+
+def render_table(conf):
+    """Renders the configuration to a markdown table"""
+    # [column][row]
+    table: List[List[str]] = []
+
+    # add backend names
+    backends: List[str] = ["Accelerator Back-end"]
+    for config_entry in get_expected_config_entries():
+        backends.append(config_entry.representation)
+    table.append(backends)
+
+    # reads the state of each backend for each compiler and generates the cell
+    # the cell contains at least a symbol for the state and can also contains
+    # a comment
+    for compiler_name, compiler_conf in conf.items():
+        column: List[str] = [compiler_name]
+        for backend in compiler_conf.values():
+            value = get_known_states()[backend["state"]]
+            if "comment" in backend:
+                value += f" {backend['comment']}"
+            column.append(value)
+        table.append(column)
+
+    # each cell in a column should have the same width
+    # therefore determine the broadest cell in a column
+    column_sizes: List[int] = []
+    for col in table:
+        size = 0
+        for row in col:
+            size = max(size, len(row))
+        column_sizes.append(size)
+
+    # print the table header
+    print("|", end="")
+    for c_num in range(len(table)):
+        print(f" {table[c_num][0]:<{column_sizes[c_num]}} |", end="")
+    print()
+
+    # print the lines under the table header
+    print("|", end="")
+    for c_num in range(len(table)):
+        print((column_sizes[c_num] + 2) * "-" + "|", end="")
+    print()
+
+    # prints each backend state cell for each compiler
+    for r_num in range(1, len(table[0])):
+        print("|", end="")
+        for c_num in range(len(table)):
+            print(f" {table[c_num][r_num]:<{column_sizes[c_num]}} |", end="")
+        print()
+
+
+if __name__ == "__main__":
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    config_path = os.path.join(script_path, "supported_compilers.json")
+
+    if not os.path.exists(config_path):
+        print_red(f"[ERROR]: {config_path} does not exist")
+        sys.exit(1)
+
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        config = json.load(config_file)
+
+    if not config_validator(config):
+        sys.exit(1)
+
+    render_table(config)
diff --git a/alpaka/script/readme_generator/supported_compilers.json b/alpaka/script/readme_generator/supported_compilers.json
new file mode 100644
index 00000000..0a929950
--- /dev/null
+++ b/alpaka/script/readme_generator/supported_compilers.json
@@ -0,0 +1,460 @@
+{
+    "gcc 9.5 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.2 - 12.5)[^2]"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "gcc 10.4 / 11.1 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.4 - 12.0)[^2]"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "gcc 12.3 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 12.0 - 12.5)"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "gcc 13.1 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 12.4 - 12.5)"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 9 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.6 - 12.0)[^2]"
+        },
+        "CUDAclang": {
+            "state": "no"
+        },
+        "hip": {
+            "state": "no"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 10/11 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.2, 11.6 - 12.0)[^2]"
+        },
+        "CUDAclang": {
+            "state": "no"
+        },
+        "hip": {
+            "state": "no"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 12 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.6 - 12.0)[^2]"
+        },
+        "CUDAclang": {
+            "state": "no"
+        },
+        "hip": {
+            "state": "no"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 13 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.7 - 12.0)"
+        },
+        "CUDAclang": {
+            "state": "no"
+        },
+        "hip": {
+            "state": "no"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 14 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 11.8 - 12.0)"
+        },
+        "CUDAclang": {
+            "state": "yes",
+            "comment": "(CUDA 11.2 - 11.5)"
+        },
+        "hip": {
+            "state": "yes",
+            "comment": "(HIP 5.1 - 5.2)"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 15 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 12.2)"
+        },
+        "CUDAclang": {
+            "state": "yes",
+            "comment": "(CUDA 11.2 - 11.5)"
+        },
+        "hip": {
+            "state": "yes",
+            "comment": "(HIP 5.3 - 5.4)"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 16 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 12.3)"
+        },
+        "CUDAclang": {
+            "state": "yes",
+            "comment": "(CUDA 11.2 - 11.5)"
+        },
+        "hip": {
+            "state": "yes",
+            "comment": "(HIP 5.5 - 5.6)"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "clang 17 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "yes",
+            "comment": "(CUDA 12.4 - 15.5)"
+        },
+        "CUDAclang": {
+            "state": "yes",
+            "comment": "(CUDA 11.2 - 11.8)"
+        },
+        "hip": {
+            "state": "yes",
+            "comment": "(HIP 5.7 - 6.1)"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    },
+    "icpx 2024.2 (Linux)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes",
+            "comment": "[^1]"
+        },
+        "OMPthread": {
+            "state": "yes",
+            "comment": "[^1]"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "no"
+        },
+        "CUDAclang": {
+            "state": "no"
+        },
+        "hip": {
+            "state": "no"
+        },
+        "sycl": {
+            "state": "yes",
+            "comment": "[^4]"
+        }
+    },
+    "Xcode 13.2.1 / 14.2 / 14.3.1 (macOS)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "none"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "none"
+        }
+    },
+    "Visual Studio 2022 (Windows)": {
+        "serial": {
+            "state": "yes"
+        },
+        "OMPblock": {
+            "state": "yes"
+        },
+        "OMPthread": {
+            "state": "yes"
+        },
+        "thread": {
+            "state": "yes"
+        },
+        "tbb": {
+            "state": "yes"
+        },
+        "CUDAnvcc": {
+            "state": "no"
+        },
+        "CUDAclang": {
+            "state": "none"
+        },
+        "hip": {
+            "state": "none"
+        },
+        "sycl": {
+            "state": "no"
+        }
+    }
+}
diff --git a/alpaka/script/run.sh b/alpaka/script/run.sh
index 4249c21c..6fae0ca0 100755
--- a/alpaka/script/run.sh
+++ b/alpaka/script/run.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run>"
 
 : "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
 echo "ALPAKA_CI_CMAKE_DIR: ${ALPAKA_CI_CMAKE_DIR}"
@@ -18,8 +21,8 @@ then
     : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
     echo "ALPAKA_CI_STDLIB: ${ALPAKA_CI_STDLIB}"
 fi
-: "${CXX?'CXX must be specified'}"
-echo "CXX: ${CXX}"
+: "${ALPAKA_CI_CXX?'ALPAKA_CI_CXX must be specified'}"
+echo "ALPAKA_CI_CXX: ${ALPAKA_CI_CXX}"
 
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
@@ -28,7 +31,7 @@ then
     then
         LD_LIBRARY_PATH=
     fi
-    if [[ "${CXX}" = "clang++"* ]]
+    if [[ "${ALPAKA_CI_CXX}" = "clang++"* ]]
     then
         if [ "${ALPAKA_CI_CLANG_VER}" -ge "10" ]
         then
@@ -63,7 +66,7 @@ then
         # We have to explicitly add the stub libcuda.so to CUDA_LIB_PATH because the real one would be installed by the driver (which we can not install).
         export CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs/
 
-        if [ "${CMAKE_CUDA_COMPILER}" == "nvcc" ]
+        if [ "${ALPAKA_CI_CUDA_COMPILER}" == "nvcc" ]
         then
             which nvcc
             nvcc -V
@@ -90,21 +93,28 @@ then
             export CMAKE_CXX_FLAGS=
         fi
 
-        if [[ "${CXX}" == "clang++"* ]]
+        if [[ "${ALPAKA_CI_CXX}" == "clang++"* ]]
         then
             CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -stdlib=libc++"
         fi
     fi
 
-    if [ "${CXX}" == "icpc" ]
+    if [ "${ALPAKA_CI_CXX}" == "icpc" ]
     then
         set +eu
-        which ${CXX} || source /opt/intel/oneapi/setvars.sh
-        set -eu
+        which ${ALPAKA_CI_CXX} || source /opt/intel/oneapi/setvars.sh
+
+        # exit by default if the command does not return 0
+        # can be deactivated by setting the environment variable alpaka_DISABLE_EXIT_FAILURE
+        # for example for local debugging in a Docker container
+        if [ -z ${alpaka_DISABLE_EXIT_FAILURE+x} ]; then
+            set -e
+        fi
+        set -u
     fi
 
-    which "${CXX}"
-    ${CXX} --version
+    which "${CMAKE_CXX_COMPILER}"
+    ${CMAKE_CXX_COMPILER} --version
 fi
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
@@ -141,7 +151,7 @@ else
     then
         export alpaka_USE_MDSPAN=FETCH
     else
-	    export alpaka_USE_MDSPAN=OFF
+        export alpaka_USE_MDSPAN=OFF
     fi
 fi
 
diff --git a/alpaka/script/run_analysis.sh b/alpaka/script/run_analysis.sh
index 104c016f..0be67a3e 100755
--- a/alpaka/script/run_analysis.sh
+++ b/alpaka/script/run_analysis.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run_analysis>"
 
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
 then
diff --git a/alpaka/script/run_build.sh b/alpaka/script/run_build.sh
index 8ffde4ec..4926fbce 100755
--- a/alpaka/script/run_build.sh
+++ b/alpaka/script/run_build.sh
@@ -4,8 +4,10 @@
 # Copyright 2014-2021 Benjamin Worpitz, Simeon Ehrig
 # SPDX-License-Identifier: MPL-2.0
 #
+set +xv
+source ./script/setup_utilities.sh
 
-source ./script/set.sh
+echo_green "<SCRIPT: run_build>"
 
 cd build/
 
diff --git a/alpaka/script/run_doxygen.sh b/alpaka/script/run_doxygen.sh
index b091cfbb..8c5ceddc 100755
--- a/alpaka/script/run_doxygen.sh
+++ b/alpaka/script/run_doxygen.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run_doxygen>"
 
 #To deploy the doxygen documentation a copy of the repository is created inside the deployed folder.
 #This copy is always in the gh-pages branch consisting only of the containing files.
diff --git a/alpaka/script/run_generate.sh b/alpaka/script/run_generate.sh
index 142bfa48..e17fd8fb 100755
--- a/alpaka/script/run_generate.sh
+++ b/alpaka/script/run_generate.sh
@@ -5,7 +5,13 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run_generate>"
+
+: "${CMAKE_CXX_COMPILER?'CMAKE_CXX_COMPILER must be specified'}"
+: "${alpaka_CXX_STANDARD?'alpaka_CXX_STANDARD must be specified'}"
 
 #-------------------------------------------------------------------------------
 
@@ -34,10 +40,6 @@ if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
 then
     echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
 fi
-if [ ! -z "${CMAKE_C_COMPILER+x}" ]
-then
-    echo "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
-fi
 if [ ! -z "${CMAKE_CXX_COMPILER+x}" ]
 then
     echo "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
@@ -78,9 +80,9 @@ mkdir -p build/
 cd build/
 
 "${ALPAKA_CI_CMAKE_EXECUTABLE}" --log-level=VERBOSE -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM}\
-    -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON "$(env2cmake alpaka_ENABLE_WERROR)" \
+    -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON -Dalpaka_BUILD_BENCHMARKS=ON "$(env2cmake alpaka_ENABLE_WERROR)" \
     "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF -DBoost_ARCHITECTURE="-x64" \
-    "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_C_COMPILER)" "$(env2cmake CMAKE_CXX_COMPILER)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" "$(env2cmake CMAKE_CXX_EXTENSIONS)"\
+    "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_CXX_COMPILER)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" "$(env2cmake CMAKE_CXX_EXTENSIONS)"\
     "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" \
     "$(env2cmake alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \
     "$(env2cmake alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" \
diff --git a/alpaka/script/run_install.sh b/alpaka/script/run_install.sh
index 01e6053d..a016eecd 100755
--- a/alpaka/script/run_install.sh
+++ b/alpaka/script/run_install.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run_install>"
 
 ALPAKA_CI_CMAKE_EXECUTABLE=cmake
 if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
diff --git a/alpaka/script/run_tests.sh b/alpaka/script/run_tests.sh
index 3fdbbc7f..b770ff5a 100755
--- a/alpaka/script/run_tests.sh
+++ b/alpaka/script/run_tests.sh
@@ -5,7 +5,10 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-source ./script/set.sh
+set +xv
+source ./script/setup_utilities.sh
+
+echo_green "<SCRIPT: run_tests>"
 
 : "${alpaka_ACC_GPU_CUDA_ENABLE?'alpaka_ACC_GPU_CUDA_ENABLE must be specified'}"
 : "${alpaka_ACC_GPU_HIP_ENABLE?'alpaka_ACC_GPU_HIP_ENABLE must be specified'}"
@@ -34,10 +37,10 @@ then
 
     if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
     then
-        ctest -V
+        ctest --output-on-failure
     elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
     then
-        ctest -V -C ${CMAKE_BUILD_TYPE}
+        ctest --output-on-failure -C ${CMAKE_BUILD_TYPE}
     fi
 
     cd ..
diff --git a/alpaka/script/setup_utilities.sh b/alpaka/script/setup_utilities.sh
new file mode 100755
index 00000000..2276fc85
--- /dev/null
+++ b/alpaka/script/setup_utilities.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# SPDX-License-Identifier: MPL-2.0
+
+# serveral helper function and tools for the CI
+# the script should be source everywhere, the utils are required
+# if a bash script is normal called, self defined bash functions are not avaible from the calling bash instance
+
+
+# exit by default if the command does not return 0
+# can be deactivated by setting the environment variable alpaka_DISABLE_EXIT_FAILURE
+# for example for local debugging in a Docker container
+if [ -z ${alpaka_DISABLE_EXIT_FAILURE+x} ]; then
+    set -e
+fi
+
+# disable command traces for the following scripts to avoid useless noise in the job output
+source ./script/setup_utilities/color_echo.sh
+source ./script/setup_utilities/travis_retry.sh
+source ./script/setup_utilities/sudo.sh
+source ./script/setup_utilities/agc-manager.sh
+source ./script/setup_utilities/set.sh
diff --git a/alpaka/script/setup_utilities/agc-manager.sh b/alpaka/script/setup_utilities/agc-manager.sh
new file mode 100755
index 00000000..3c6773b1
--- /dev/null
+++ b/alpaka/script/setup_utilities/agc-manager.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+: ${ALPAKA_CI_OS_NAME?"ALPAKA_CI_OS_NAME must be specified"}
+
+# the agc-manager only exists in the agc-container
+# set alias to false, so each time if we ask the agc-manager if a software is installed, it will
+# return false and the installation of software will be triggered
+if [ "$ALPAKA_CI_OS_NAME" != "Linux" ] || [ ! -f "/usr/bin/agc-manager" ]; then
+    # display message only one time not everytime the script is sourced
+    if [ -z ${PRINT_INSTALL_AGC_MANAGER+x} ]; then
+        echo_yellow "install fake agc-manager"
+        export PRINT_INSTALL_AGC_MANAGER=true
+    fi
+
+    echo '#!/bin/bash' >agc-manager
+    echo 'exit 1' >>agc-manager
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]; then
+        sudo chmod +x agc-manager
+        sudo mv agc-manager /usr/bin/agc-manager
+    elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]; then
+        chmod +x agc-manager
+        mv agc-manager /usr/bin
+    elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]; then
+        sudo chmod +x agc-manager
+        sudo mv agc-manager /usr/local/bin
+    else
+        echo_red "installing agc-manager: " \
+        "unknown operation system: ${ALPAKA_CI_OS_NAME}"
+        exit 1
+    fi
+else
+    # display message only one time not everytime the script is sourced
+    if [ -z ${PRINT_INSTALL_AGC_MANAGER+x} ]; then
+        echo_green "found agc-manager"
+        export PRINT_INSTALL_AGC_MANAGER=true
+    fi
+fi
diff --git a/alpaka/script/setup_utilities/color_echo.sh b/alpaka/script/setup_utilities/color_echo.sh
new file mode 100755
index 00000000..45d95085
--- /dev/null
+++ b/alpaka/script/setup_utilities/color_echo.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# SPDX-License-Identifier: MPL-2.0
+
+# colored output
+
+# display a message in green
+echo_green() {
+    # macOS uses bash 3, therefor \e is not working \033 needs to be used
+    # https://stackoverflow.com/questions/28782394/how-to-get-osx-shell-script-to-show-colors-in-echo
+    echo -e "\033[1;32m$1\033[0m"
+}
+
+# display a message in yellow
+echo_yellow() {
+    echo -e "\033[1;33m$1\033[0m"
+}
+
+# display a message in red
+echo_red() {
+    echo -e "\033[1;31m$1\033[0m"
+}
diff --git a/alpaka/script/set.sh b/alpaka/script/setup_utilities/set.sh
similarity index 62%
rename from alpaka/script/set.sh
rename to alpaka/script/setup_utilities/set.sh
index 5719100f..2e738877 100755
--- a/alpaka/script/set.sh
+++ b/alpaka/script/setup_utilities/set.sh
@@ -11,4 +11,12 @@
 # -u: treat unset variables as an error
 # -v: Print shell input lines as they are read
 # -x: Print command traces before executing command
-set -eouvx pipefail
+
+# exit by default if the command does not return 0
+# can be deactivated by setting the environment variable alpaka_DISABLE_EXIT_FAILURE
+# for example for local debugging in a Docker container
+if [ -z ${alpaka_DISABLE_EXIT_FAILURE+x} ]; then
+    set -e
+fi
+
+set -ouvx pipefail
diff --git a/alpaka/script/setup_utilities/sudo.sh b/alpaka/script/setup_utilities/sudo.sh
new file mode 100755
index 00000000..26bbffeb
--- /dev/null
+++ b/alpaka/script/setup_utilities/sudo.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# inside the agc-container, the user is root and does not require sudo
+# to compatibility to other container, fake the missing sudo command
+if ! command -v sudo &>/dev/null; then
+    if [ "$ALPAKA_CI_OS_NAME" == "Linux" ]; then
+        # display message only one time not everytime the script is sourced
+        if [ -z ${PRINT_INSTALL_SUDO+x} ]; then
+            echo_yellow "install sudo"
+            export PRINT_INSTALL_SUDO=true
+        fi
+
+        DEBIAN_FRONTEND=noninteractive travis_retry apt update
+        DEBIAN_FRONTEND=noninteractive travis_retry apt install -y sudo
+    fi
+fi
diff --git a/alpaka/script/travis_retry.sh b/alpaka/script/setup_utilities/travis_retry.sh
similarity index 66%
rename from alpaka/script/travis_retry.sh
rename to alpaka/script/setup_utilities/travis_retry.sh
index f158f5eb..6bb5b594 100755
--- a/alpaka/script/travis_retry.sh
+++ b/alpaka/script/setup_utilities/travis_retry.sh
@@ -21,26 +21,27 @@
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-ANSI_RED="\033[31m"
-ANSI_RESET="\033[0m"
-
 travis_retry() {
-  set +euo pipefail
-  local result=0
-  local count=1
-  local max=666
-  while [ $count -le $max ]; do
-    [ $result -ne 0 ] && {
-      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of $max.${ANSI_RESET}\n" >&2
+  # apply `set +euo pipefail` in a local scope so that the following script is not affected and 
+  # e.g. exit on failure is not deactivated
+  (
+    set +euo pipefail
+    local result=0
+    local count=1
+    local max=666
+    while [ $count -le $max ]; do
+      [ $result -ne 0 ] && {
+        echo_red "\nThe command \"$*\" failed. Retrying, $count of $max.\n" >&2
+      }
+      "$@"
+      result=$?
+      [ $result -eq 0 ] && break
+      count=$((count + 1))
+      sleep 1
+    done
+    [ $count -gt $max ] && {
+      echo_red "\nThe command \"$*\" failed $max times.\n" >&2
     }
-    "$@"
-    result=$?
-    [ $result -eq 0 ] && break
-    count=$((count + 1))
-    sleep 1
-  done
-  [ $count -gt $max ] && {
-    echo -e "\n${ANSI_RED}The command \"$*\" failed $max times.${ANSI_RESET}\n" >&2
-  }
-  return $result
+    return $result
+  )
 }
diff --git a/alpaka/test/common/CMakeLists.txt b/alpaka/test/common/CMakeLists.txt
index 538bd34e..1f3df34b 100644
--- a/alpaka/test/common/CMakeLists.txt
+++ b/alpaka/test/common/CMakeLists.txt
@@ -33,13 +33,13 @@ target_link_libraries(${_COMMON_TARGET_NAME} INTERFACE alpaka::alpaka)
 # Prevent "unsafe buffer usage" warnings from clang >= 16
 if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "16.0.0") OR
    (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "2023.2.0"))
-    
+
     target_compile_options(${_COMMON_TARGET_NAME} INTERFACE "-Wno-unsafe-buffer-usage")
 
     # We have no way to determine if we are using amdclang++. So we will just decide this by checking for the HIP back-end.
     if(alpaka_ACC_GPU_HIP_ENABLE)
         # amdclang++-5.5 pretends to be clang-16 but doesn't know all warnings.
-        target_compile_options(${_COMMON_TARGET_NAME} INTERFACE "-Wno-unknown-warning-option") 
+        target_compile_options(${_COMMON_TARGET_NAME} INTERFACE "-Wno-unknown-warning-option")
     endif()
 endif()
 
diff --git a/alpaka/test/common/devCompileOptions.cmake b/alpaka/test/common/devCompileOptions.cmake
index 84e892bf..957354d8 100644
--- a/alpaka/test/common/devCompileOptions.cmake
+++ b/alpaka/test/common/devCompileOptions.cmake
@@ -33,7 +33,7 @@ if(MSVC)
     endif()
     # Improve debugging.
     list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CXX>>:SHELL:/Zo>"
-                                           "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-Xcompiler /Zo>")
+                                           "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:SHELL:-Xcompiler /Zo>")
 
     # Flags added in Visual Studio 2013
     list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:/Zc:throwingNew>"
@@ -50,7 +50,7 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
     list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wall")
     list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wextra")
     # Turn off -pedantic when compiling CUDA code, otherwise the CI logs are flooded with warnings. gcc doesn't like nvcc's code transformations.
-    list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-pedantic>" 
+    list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-pedantic>"
                                            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler -Wno-pedantic>")
     if(alpaka_ENABLE_WERROR)
         list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-Werror>"
@@ -189,7 +189,8 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Inte
         if (alpaka_ACC_SYCL_ENABLE)
             # avoid: warning: disabled expansion of recursive macro
             list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wno-disabled-macro-expansion")
+            list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wno-reserved-identifier")
+            list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wno-old-style-cast")
         endif()
     endif()
 endif()
-
diff --git a/alpaka/test/integ/axpy/src/axpy.cpp b/alpaka/test/integ/axpy/src/axpy.cpp
index c5742814..67b73f57 100644
--- a/alpaka/test/integ/axpy/src/axpy.cpp
+++ b/alpaka/test/integ/axpy/src/axpy.cpp
@@ -94,26 +94,13 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
 
     alpaka::Vec<Dim, Idx> const extent(numElements);
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        static_cast<Idx>(3u),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout << "AxpyKernel("
-              << " numElements:" << numElements << ", accelerator: " << alpaka::getAccName<Acc>()
-              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
-              << std::endl;
-
     // Allocate host memory buffers in pinned memory.
     auto memBufHostX = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
     auto memBufHostOrigY = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
     auto memBufHostY = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
-    Val* const pBufHostX = alpaka::getPtrNative(memBufHostX);
-    Val* const pBufHostOrigY = alpaka::getPtrNative(memBufHostOrigY);
-    Val* const pBufHostY = alpaka::getPtrNative(memBufHostY);
+    Val* const pBufHostX = std::data(memBufHostX);
+    Val* const pBufHostOrigY = std::data(memBufHostOrigY);
+    Val* const pBufHostY = std::data(memBufHostY);
 
     // random generator for uniformly distributed numbers in [0,1)
     // keep in mind, this can generate different values on different platforms
@@ -159,14 +146,33 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
     std::cout << std::endl;
 #endif
 
+
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extent, static_cast<Idx>(3u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        kernel,
+        numElements,
+        alpha,
+        std::data(memBufAccX),
+        std::data(memBufAccY));
+
+    std::cout << "AxpyKernel("
+              << " numElements:" << numElements << ", accelerator: " << alpaka::getAccName<Acc>()
+              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
+              << std::endl;
+
     // Create the kernel execution task.
     auto const taskKernel = alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
         numElements,
         alpha,
-        alpaka::getPtrNative(memBufAccX),
-        alpaka::getPtrNative(memBufAccY));
+        std::data(memBufAccX),
+        std::data(memBufAccY));
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
diff --git a/alpaka/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp b/alpaka/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp
index 59eb0bdd..338bebb7 100644
--- a/alpaka/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp
+++ b/alpaka/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp
@@ -49,14 +49,14 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs)
     // host buffer
     auto h_buffer1 = alpaka::allocMappedBufIfSupported<int, Idx>(host, platformAcc, Vec1D{Idx{42}});
     INFO(
-        "host buffer allocated at " << alpaka::getPtrNative(h_buffer1) << " with "
-                                    << alpaka::getExtentProduct(h_buffer1) << " element(s)");
+        "host buffer allocated at " << std::data(h_buffer1) << " with " << alpaka::getExtentProduct(h_buffer1)
+                                    << " element(s)");
 
     // async host buffer
     auto h_buffer2 = alpaka::allocAsyncBufIfSupported<int, Idx>(hostQueue, Vec1D{Idx{42}});
     INFO(
-        "second host buffer allocated at " << alpaka::getPtrNative(h_buffer2) << " with "
-                                           << alpaka::getExtentProduct(h_buffer2) << " element(s)");
+        "second host buffer allocated at " << std::data(h_buffer2) << " with " << alpaka::getExtentProduct(h_buffer2)
+                                           << " element(s)");
 
     // host-side memset
     int const value1 = 42;
@@ -64,7 +64,7 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs)
     INFO("host-side memset");
     alpaka::memset(hostQueue, h_buffer1, value1);
     alpaka::wait(hostQueue);
-    CHECK(expected1 == *alpaka::getPtrNative(h_buffer1));
+    CHECK(expected1 == *std::data(h_buffer1));
 
     // host-side async memset
     int const value2 = 99;
@@ -72,16 +72,16 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs)
     INFO("host-side async memset");
     alpaka::memset(hostQueue, h_buffer2, value2);
     alpaka::wait(hostQueue);
-    CHECK(expected2 == *alpaka::getPtrNative(h_buffer2));
+    CHECK(expected2 == *std::data(h_buffer2));
 
     // host-host copies
     INFO("buffer host-host copies");
     alpaka::memcpy(hostQueue, h_buffer2, h_buffer1);
     alpaka::wait(hostQueue);
-    CHECK(expected1 == *alpaka::getPtrNative(h_buffer2));
+    CHECK(expected1 == *std::data(h_buffer2));
     alpaka::memcpy(hostQueue, h_buffer1, h_buffer2);
     alpaka::wait(hostQueue);
-    CHECK(expected1 == *alpaka::getPtrNative(h_buffer1));
+    CHECK(expected1 == *std::data(h_buffer1));
 
     // GPU device
     auto const device = alpaka::getDevByIdx(platformAcc, 0);
@@ -91,14 +91,14 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs)
     // device buffer
     auto d_buffer1 = alpaka::allocBuf<int, Idx>(device, Vec1D{Idx{42}});
     INFO(
-        "device buffer allocated at " << alpaka::getPtrNative(d_buffer1) << " with "
-                                      << alpaka::getExtentProduct(d_buffer1) << " element(s)");
+        "device buffer allocated at " << std::data(d_buffer1) << " with " << alpaka::getExtentProduct(d_buffer1)
+                                      << " element(s)");
 
     // async or second sync device buffer
     auto d_buffer2 = alpaka::allocAsyncBufIfSupported<int, Idx>(deviceQueue, Vec1D{Idx{42}});
     INFO(
-        "second device buffer allocated at " << alpaka::getPtrNative(d_buffer2) << " with "
-                                             << alpaka::getExtentProduct(d_buffer2) << " element(s)");
+        "second device buffer allocated at " << std::data(d_buffer2) << " with " << alpaka::getExtentProduct(d_buffer2)
+                                             << " element(s)");
 
     // host-device copies
     INFO("host-device copies");
@@ -121,6 +121,6 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs)
     alpaka::memcpy(deviceQueue, h_buffer2, d_buffer2);
 
     alpaka::wait(deviceQueue);
-    CHECK(expected1 == *alpaka::getPtrNative(h_buffer1));
-    CHECK(expected2 == *alpaka::getPtrNative(h_buffer2));
+    CHECK(expected1 == *std::data(h_buffer1));
+    CHECK(expected2 == *std::data(h_buffer2));
 }
diff --git a/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp b/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
index ce94e178..58ba9c6b 100644
--- a/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
+++ b/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
@@ -236,7 +236,7 @@ auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> voi
     ofs.put(0x20); // Image Descriptor Byte.
 
     // Write the data.
-    char const* pData(reinterpret_cast<char const*>(alpaka::getPtrNative(bufRgba)));
+    char const* pData(reinterpret_cast<char const*>(std::data(bufRgba)));
     // If there is no padding, we can directly write the whole buffer data ...
     if(bufRowPitchBytes == bufWidthBytes)
     {
@@ -294,20 +294,6 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
 
     alpaka::Vec<Dim, Idx> const extent(static_cast<Idx>(numRows), static_cast<Idx>(numCols));
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        alpaka::Vec<Dim, Idx>::ones(),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout << "MandelbrotKernel("
-              << " numRows:" << numRows << ", numCols:" << numCols << ", maxIterations:" << maxIterations
-              << ", accelerator: " << alpaka::getAccName<Acc>()
-              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
-              << std::endl;
-
     // allocate host memory, potentially pinned for faster copy to/from the accelerator.
     auto bufColorHost = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
 
@@ -320,10 +306,36 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
     // Create the kernel execution task.
     auto const [rowPitch, _] = alpaka::getPitchesInBytes(bufColorAcc);
     CHECK(rowPitch % sizeof(Val) == 0);
+
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extent, alpaka::Vec<Dim, Idx>::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        kernel,
+        std::data(bufColorAcc),
+        numRows,
+        numCols,
+        rowPitch,
+        fMinR,
+        fMaxR,
+        fMinI,
+        fMaxI,
+        maxIterations);
+
+    std::cout << "MandelbrotKernel("
+              << " numRows:" << numRows << ", numCols:" << numCols << ", maxIterations:" << maxIterations
+              << ", accelerator: " << alpaka::getAccName<Acc>()
+              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
+              << std::endl;
+
+
     auto const taskKernel = alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
-        alpaka::getPtrNative(bufColorAcc),
+        std::data(bufColorAcc),
         numRows,
         numCols,
         rowPitch,
diff --git a/alpaka/test/integ/matMul/src/matMul.cpp b/alpaka/test/integ/matMul/src/matMul.cpp
index 149d94df..dd87cfe4 100644
--- a/alpaka/test/integ/matMul/src/matMul.cpp
+++ b/alpaka/test/integ/matMul/src/matMul.cpp
@@ -190,18 +190,6 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     // Result matrix is MxN. We create one worker per result matrix cell.
     Vec2 const extentC(static_cast<Idx>(m), static_cast<Idx>(n));
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extentC,
-        alpaka::Vec<Dim, Idx>::ones(),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::EqualExtent));
-
-    std::cout << "MatMulKernel("
-              << "m:" << m << ", n:" << n << ", k:" << k << ", accelerator: " << alpaka::getAccName<Acc>()
-              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
-              << std::endl;
 
     // Allocate the A and B matrices as std::vectors because this allows them to be filled with uint32_t(1).
     // alpaka::set only supports setting all bytes leading to a value of 16843009 in all elements.
@@ -255,6 +243,32 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     std::cout << "pitchesB " << alpaka::getPitchesInBytes(bufBAcc) << " ldb: " << ldb << "\n";
     std::cout << "pitchesC " << alpaka::getPitchesInBytes(bufCAcc) << " ldc: " << ldc << "\n";
 
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {extentC, alpaka::Vec<Dim, Idx>::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::EqualExtent};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        kernel,
+        m,
+        n,
+        k,
+        static_cast<Val>(1),
+        std::data(bufAAcc),
+        lda,
+        std::data(bufBAcc),
+        ldb,
+        static_cast<Val>(1),
+        std::data(bufCAcc),
+        ldc);
+
+
+    std::cout << "MatMulKernel("
+              << "m:" << m << ", n:" << n << ", k:" << k << ", accelerator: " << alpaka::getAccName<Acc>()
+              << ", kernel: " << alpaka::core::demangled<decltype(kernel)> << ", workDiv: " << workDiv << ")"
+              << std::endl;
+
+
     // Create the kernel execution task.
     auto const taskKernel = alpaka::createTaskKernel<Acc>(
         workDiv,
@@ -263,12 +277,12 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
         n,
         k,
         static_cast<Val>(1),
-        alpaka::getPtrNative(bufAAcc),
+        std::data(bufAAcc),
         lda,
-        alpaka::getPtrNative(bufBAcc),
+        std::data(bufBAcc),
         ldb,
         static_cast<Val>(1),
-        alpaka::getPtrNative(bufCAcc),
+        std::data(bufCAcc),
         ldc);
 
     // Profile the kernel execution.
@@ -290,7 +304,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
     auto const correctResult = static_cast<Val>(k);
 
     bool resultCorrect = true;
-    auto const pHostData = alpaka::getPtrNative(bufCHost);
+    auto const pHostData = std::data(bufCHost);
     for(Idx i(0u); i < m * n; ++i)
     {
         auto const& val(pHostData[i]);
diff --git a/alpaka/test/integ/separableCompilation/src/main.cpp b/alpaka/test/integ/separableCompilation/src/main.cpp
index cec49acf..70739d10 100644
--- a/alpaka/test/integ/separableCompilation/src/main.cpp
+++ b/alpaka/test/integ/separableCompilation/src/main.cpp
@@ -90,18 +90,6 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc
     // The data extent.
     alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent(numElements);
 
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::WorkDivMembers<alpaka::DimInt<1u>, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        extent,
-        static_cast<Idx>(3u),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout << alpaka::core::demangled<decltype(kernel)> << "("
-              << "accelerator: " << alpaka::getAccName<Acc>() << ", workDiv: " << workDiv
-              << ", numElements:" << numElements << ")" << std::endl;
-
     // Allocate host memory buffers, potentially pinned for faster copy to/from the accelerator.
     auto memBufHostA = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
     auto memBufHostB = alpaka::allocMappedBufIfSupported<Val, Idx>(devHost, platformAcc, extent);
@@ -123,6 +111,21 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc
     alpaka::memcpy(queueAcc, memBufAccA, memBufHostA);
     alpaka::memcpy(queueAcc, memBufAccB, memBufHostB);
 
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const kernelCfg = {extent, static_cast<Idx>(3u)};
+    auto const workDiv = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        kernel,
+        memBufAccA.data(),
+        memBufAccB.data(),
+        memBufAccC.data(),
+        numElements);
+
+    std::cout << alpaka::core::demangled<decltype(kernel)> << "("
+              << "accelerator: " << alpaka::getAccName<Acc>() << ", workDiv: " << workDiv
+              << ", numElements:" << numElements << ")" << std::endl;
+
     // Create the executor task.
     auto const taskKernel = alpaka::createTaskKernel<Acc>(
         workDiv,
diff --git a/alpaka/test/integ/sharedMem/src/sharedMem.cpp b/alpaka/test/integ/sharedMem/src/sharedMem.cpp
index 94cca7e7..855e0946 100644
--- a/alpaka/test/integ/sharedMem/src/sharedMem.cpp
+++ b/alpaka/test/integ/sharedMem/src/sharedMem.cpp
@@ -108,7 +108,6 @@ using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::Dim<Acc>;
     using Idx = alpaka::Idx<Acc>;
 
     Idx const numElements = 1u << 16u;
@@ -130,13 +129,17 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
     // Get a queue on this device.
     QueueAcc queue(devAcc);
 
-    // Set the grid blocks extent.
-    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
-        devAcc,
-        numElements,
-        static_cast<Idx>(1u),
-        false,
-        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    auto blockRetValuesDummy = alpaka::allocBuf<Val, Idx>(devAcc, static_cast<Idx>(1));
+
+    // Kernel input during the runtime of kernel will be different and is chosen to depend on workdiv.
+    // Therefore, initially a  workdiv is needed to find the parameter. Therefore, in kernel bundle, we can not use the
+    // real input for the buffer pointer.
+
+    // Let alpaka calculate good block and grid sizes given our full problem extent
+    alpaka::KernelCfg<Acc> const kernelCfg
+        = {numElements, static_cast<Idx>(1u), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, std::data(blockRetValuesDummy));
 
     std::cout << "SharedMemKernel("
               << " accelerator: " << alpaka::getAccName<Acc>()
@@ -155,7 +158,7 @@ TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
     alpaka::memcpy(queue, blockRetValsAcc, blockRetVals, resultElemCount);
 
     // Create the kernel execution task.
-    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernel, alpaka::getPtrNative(blockRetValsAcc));
+    auto const taskKernel = alpaka::createTaskKernel<Acc>(workDiv, kernel, std::data(blockRetValsAcc));
 
     // Profile the kernel execution.
     std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
diff --git a/alpaka/test/unit/CMakeLists.txt b/alpaka/test/unit/CMakeLists.txt
index bcd108ff..983bb732 100644
--- a/alpaka/test/unit/CMakeLists.txt
+++ b/alpaka/test/unit/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory("event/")
 add_subdirectory("idx/")
 add_subdirectory("intrinsic/")
 add_subdirectory("kernel/")
+add_subdirectory("exec/")
 add_subdirectory("math/")
 add_subdirectory("mem/buf/")
 add_subdirectory("mem/copy/")
@@ -33,6 +34,7 @@ add_subdirectory("mem/view/")
 add_subdirectory("meta/")
 add_subdirectory("queue/")
 add_subdirectory("rand/")
+add_subdirectory("runtime/")
 add_subdirectory("traits/")
 add_subdirectory("vec/")
 add_subdirectory("warp/")
diff --git a/alpaka/test/unit/acc/src/AccDevPropsTest.cpp b/alpaka/test/unit/acc/src/AccDevPropsTest.cpp
index 46100422..650942c4 100644
--- a/alpaka/test/unit/acc/src/AccDevPropsTest.cpp
+++ b/alpaka/test/unit/acc/src/AccDevPropsTest.cpp
@@ -24,4 +24,46 @@ TEMPLATE_LIST_TEST_CASE("getAccDevProps", "[acc]", alpaka::test::TestAccs)
     REQUIRE(devProps.m_threadElemCountMax > 0);
     REQUIRE(devProps.m_multiProcessorCount > 0);
     REQUIRE(devProps.m_sharedMemSizeBytes > 0);
+    REQUIRE(devProps.m_globalMemSizeBytes > 0);
 }
+
+TEST_CASE("AccDevProps.aggregate_init", "[acc]")
+{
+    auto const props = alpaka::AccDevProps<alpaka::DimInt<1>, int>{1, {2}, 3, {4}, 5, {6}, 7, 8, 9};
+
+    CHECK(props.m_multiProcessorCount == 1);
+    CHECK(props.m_gridBlockExtentMax == alpaka::Vec{2});
+    CHECK(props.m_gridBlockCountMax == 3);
+    CHECK(props.m_blockThreadExtentMax == alpaka::Vec{4});
+    CHECK(props.m_blockThreadCountMax == 5);
+    CHECK(props.m_threadElemExtentMax == alpaka::Vec{6});
+    CHECK(props.m_threadElemCountMax == 7);
+    CHECK(props.m_sharedMemSizeBytes == 8);
+    CHECK(props.m_globalMemSizeBytes == 9);
+}
+
+#ifdef __cpp_designated_initializers
+TEST_CASE("AccDevProps.designated_initializers", "[acc]")
+{
+    auto const props = alpaka::AccDevProps<alpaka::DimInt<1>, int>{
+        .m_multiProcessorCount = 10,
+        .m_gridBlockExtentMax = {20},
+        .m_gridBlockCountMax = 30,
+        .m_blockThreadExtentMax = {40},
+        .m_blockThreadCountMax = 50,
+        .m_threadElemExtentMax = {60},
+        .m_threadElemCountMax = 70,
+        .m_sharedMemSizeBytes = 80,
+        .m_globalMemSizeBytes = 90};
+
+    CHECK(props.m_multiProcessorCount == 10);
+    CHECK(props.m_gridBlockExtentMax == alpaka::Vec{20});
+    CHECK(props.m_gridBlockCountMax == 30);
+    CHECK(props.m_blockThreadExtentMax == alpaka::Vec{40});
+    CHECK(props.m_blockThreadCountMax == 50);
+    CHECK(props.m_threadElemExtentMax == alpaka::Vec{60});
+    CHECK(props.m_threadElemCountMax == 70);
+    CHECK(props.m_sharedMemSizeBytes == 80);
+    CHECK(props.m_globalMemSizeBytes == 90);
+}
+#endif
diff --git a/alpaka/test/unit/acc/src/AccTagTest.cpp b/alpaka/test/unit/acc/src/AccTagTest.cpp
index 49024fc7..ac798bc4 100644
--- a/alpaka/test/unit/acc/src/AccTagTest.cpp
+++ b/alpaka/test/unit/acc/src/AccTagTest.cpp
@@ -2,6 +2,10 @@
  * SPDX-License-Identifier: MPL-2.0
  */
 
+// Undefine ALPAKA_CI for this test, because the variable set the value of some acc types, like
+// AccCpuThreadsIfAvailableElseInt to int independent of the cmake configuration. This avoids long running test cases
+// but is problematic for this test.
+#undef ALPAKA_CI
 #include <alpaka/alpaka.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 
@@ -248,3 +252,26 @@ TEMPLATE_LIST_TEST_CASE("kernel specialization with tags", "[acc][tag]", TestAcc
 
     REQUIRE(alpaka::getPtrNative(memHost)[0] == expected_result);
 }
+
+TEMPLATE_LIST_TEST_CASE("test AccIsEnabled", "[acc][tag]", AccToTagMap)
+{
+    using TestAcc = std::tuple_element_t<0, TestType>;
+    using TestTag = std::tuple_element_t<1, TestType>;
+
+
+    // if the Acc is not enabled, the type is int
+    if constexpr(!std::is_same_v<TestAcc, int>)
+    {
+        STATIC_REQUIRE(alpaka::AccIsEnabled<TestTag>::value);
+    }
+    else
+    {
+        STATIC_REQUIRE_FALSE(alpaka::AccIsEnabled<TestTag>::value);
+    }
+}
+
+TEST_CASE("test EnabledAccTags", "[acc][tag]")
+{
+    using AllAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1>, int>;
+    STATIC_REQUIRE(std::tuple_size<AllAccs>::value == std::tuple_size<alpaka::EnabledAccTags>::value);
+}
diff --git a/alpaka/test/unit/acc/src/AccTraitTest.cpp b/alpaka/test/unit/acc/src/AccTraitTest.cpp
new file mode 100644
index 00000000..57ef5b2c
--- /dev/null
+++ b/alpaka/test/unit/acc/src/AccTraitTest.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/acc/Traits.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+TEMPLATE_LIST_TEST_CASE("isSingleThreadAcc", "[acc]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+
+    // Check that both traits are defined, and that only one is true.
+    REQUIRE(alpaka::isSingleThreadAcc<Acc> != alpaka::isMultiThreadAcc<Acc>);
+
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+    auto const devProps = alpaka::getAccDevProps<Acc>(dev);
+
+    // Compare the runtime properties with the compile time trait.
+    INFO("Accelerator: " << alpaka::core::demangled<Acc>);
+    if constexpr(alpaka::isSingleThreadAcc<Acc>)
+    {
+        // Require a single thread per block.
+        REQUIRE(devProps.m_blockThreadCountMax == 1);
+    }
+    else
+    {
+        // Assume multiple threads per block, but allow a single thread per block.
+        // For example, the AccCpuOmp2Threads accelerator may report a single thread on a single core system.
+        REQUIRE(devProps.m_blockThreadCountMax >= 1);
+    }
+}
diff --git a/alpaka/test/unit/block/sharedSharing/src/BlockSharedMemSharing.cpp b/alpaka/test/unit/block/sharedSharing/src/BlockSharedMemSharing.cpp
index e62a97f3..587991eb 100644
--- a/alpaka/test/unit/block/sharedSharing/src/BlockSharedMemSharing.cpp
+++ b/alpaka/test/unit/block/sharedSharing/src/BlockSharedMemSharing.cpp
@@ -67,7 +67,7 @@ void BlockSharedMemSharingTest(TKernel kernel)
 
     auto bufAcc = alpaka::allocBuf<std::uint32_t, Idx>(devAcc, gridBlockCount);
 
-    alpaka::exec<TAcc>(queue, workDiv, kernel, alpaka::getPtrNative(bufAcc));
+    alpaka::exec<TAcc>(queue, workDiv, kernel, std::data(bufAcc));
 
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -75,7 +75,7 @@ void BlockSharedMemSharingTest(TKernel kernel)
 
     alpaka::memcpy(queue, bufHost, bufAcc);
 
-    auto pBufHost = alpaka::getPtrNative(bufHost);
+    auto pBufHost = std::data(bufHost);
     for(Idx a = 0u; a < gridBlockCount; ++a)
     {
         REQUIRE(pBufHost[a] == blockThreadCount);
diff --git a/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp b/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp
index e8f76b37..b451ccb1 100644
--- a/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp
+++ b/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp
@@ -29,3 +29,10 @@ TEMPLATE_LIST_TEST_CASE("getPreferredWarpSize", "[dev]", alpaka::test::TestAccs)
     auto const preferredWarpSize = alpaka::getPreferredWarpSize(dev);
     REQUIRE(preferredWarpSize > 0);
 }
+
+TEMPLATE_LIST_TEST_CASE("isDevice", "[dev]", alpaka::test::TestAccs)
+{
+    auto const platform = alpaka::Platform<TestType>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+    REQUIRE(alpaka::isDevice<decltype(dev)>);
+}
diff --git a/alpaka/test/unit/exec/CMakeLists.txt b/alpaka/test/unit/exec/CMakeLists.txt
new file mode 100644
index 00000000..44a06a2b
--- /dev/null
+++ b/alpaka/test/unit/exec/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Copyright 2024 Benjamin Worpitz, Jakob Krude, Andrea Bocci
+# SPDX-License-Identifier: MPL-2.0
+
+set(_TARGET_NAME "execTest")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+append_recursive_files_add_to_src_group("src/" "src/" "hpp" _FILES_HEADER)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE}
+    ${_FILES_HEADER})
+
+target_include_directories(
+    ${_TARGET_NAME}
+    PRIVATE ${Boost_INCLUDE_DIRS})
+
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})
diff --git a/alpaka/test/unit/exec/src/IndependentElements.cpp b/alpaka/test/unit/exec/src/IndependentElements.cpp
new file mode 100644
index 00000000..18621f07
--- /dev/null
+++ b/alpaka/test/unit/exec/src/IndependentElements.cpp
@@ -0,0 +1,205 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "WorkDiv.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/exec/UniformElements.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/test/acc/TestAccs.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <random>
+#include <type_traits>
+
+#if BOOST_COMP_MSVC
+// MSVC uses __restrict instead of __restrict__.
+#    define __restrict__ __restrict
+#endif
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
+// Global Host object used by all tests.
+using Host = alpaka::DevCpu;
+static Host host = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+
+/* Add the group id to the value of each element in the group.
+ * Each group is composed by the elements first[group]..first[group+1]-1 .
+ */
+struct IndependentWorkKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> const* __restrict__ indices,
+        alpaka::Idx<TAcc> groups) const
+    {
+        using Idx = alpaka::Idx<TAcc>;
+
+        for(auto group : alpaka::independentGroups(acc, groups))
+        {
+            Idx first = indices[group];
+            Idx last = indices[group + 1];
+            Idx size = last - first;
+            for(auto index : alpaka::independentGroupElements(acc, size))
+            {
+                out[first + index] = in[first + index] + static_cast<float>(group);
+            }
+        }
+    }
+};
+
+/* Test the IndependentWorkKernel kernel on all devices
+ */
+template<typename TAcc, typename TKernel>
+void testIndependentWorkKernel(
+    alpaka::Idx<TAcc> groups,
+    alpaka::Idx<TAcc> grid_size,
+    alpaka::Idx<TAcc> block_size,
+    TKernel kernel)
+{
+    using Acc = TAcc;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    // Initialise the accelerator platform.
+    Platform platform{};
+
+    // Random number generator with a gaussian distribution.
+    std::random_device rd{};
+    std::default_random_engine engine{rd()};
+
+    // Uniform distribution.
+    std::uniform_int_distribution<Idx> random_size{100, 201};
+
+    // Gaussian distribution.
+    std::normal_distribution<float> dist{0.f, 1.f};
+
+    // Build the groups.
+    auto indices_h = alpaka::allocMappedBuf<Idx, Idx>(host, platform, groups + 1);
+    indices_h[0] = 0;
+    for(Idx i = 0; i < groups; ++i)
+    {
+        // Group "i" has "size" elements.
+        auto size = random_size(engine);
+        indices_h[i + 1] = indices_h[i] + size;
+    }
+
+    // Tolerance.
+    constexpr float epsilon = 0.000001f;
+
+    // Buffer size.
+    const Idx size = indices_h[groups];
+
+    // Allocate the input and output host buffer in pinned memory accessible by the Platform devices.
+    auto in_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto out_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+
+    // Fill the input buffers with random data, and the output buffer with zeros.
+    for(Idx i = 0; i < size; ++i)
+    {
+        in_h[i] = dist(engine);
+        out_h[i] = 0;
+    }
+
+    // Run the test on each device.
+    for(auto const& device : alpaka::getDevs(platform))
+    {
+        /* clang-format off */
+        INFO("Test IndependentWorkKernel on " << alpaka::getName(device) << " over " << size << " elements in "
+                                              << groups << " independent groups with " << grid_size << " blocks of "
+                                              << block_size << " elements");
+        /* clang-format on */
+        auto queue = Queue(device);
+
+        // Allocate input and output buffers on the device.
+        auto indices_d = alpaka::allocAsyncBufIfSupported<Idx, Idx>(queue, groups + 1);
+        auto in_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto out_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+
+        // Copy the input data to the device; the size is known from the buffer objects.
+        alpaka::memcpy(queue, indices_d, indices_h);
+        alpaka::memcpy(queue, in_d, in_h);
+
+        // Fill the output buffer with zeros; the size is known from the buffer objects.
+        alpaka::memset(queue, out_d, 0);
+
+        // Launch the 1-dimensional kernel with independent work groups.
+        auto workdiv = makeWorkDiv<TAcc>(grid_size, block_size);
+        alpaka::exec<TAcc>(queue, workdiv, kernel, in_d.data(), out_d.data(), indices_d.data(), groups);
+
+        // Copy the results from the device to the host.
+        alpaka::memcpy(queue, out_h, out_d);
+
+        // Wait for all the operations to complete.
+        alpaka::wait(queue);
+
+        // Check the results.
+        for(Idx g = 0; g < groups; ++g)
+        {
+            Idx first = indices_h[g];
+            Idx last = indices_h[g + 1];
+            for(Idx i = first; i < last; ++i)
+            {
+                float sum = in_h[i] + static_cast<float>(g);
+                float delta = std::max(std::fabs(sum) * epsilon, epsilon);
+                REQUIRE(out_h[i] < sum + delta);
+                REQUIRE(out_h[i] > sum - delta);
+            }
+        }
+    }
+}
+
+TEMPLATE_LIST_TEST_CASE("IndependentElements", "[exec]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+
+    // 1-dimensional kernels.
+    if constexpr(Dim::value == 1)
+    {
+        SECTION("IndependentWorkKernel, small block size")
+        {
+            // Launch the independent work kernel with a small block size and a small number of blocks; this relies on
+            // the kernel to loop over the "problem space" and do more work per block.
+            INFO("Test independent work kernel with small block size, using scalar dimensions");
+            testIndependentWorkKernel<TestType>(100, 32, 32, IndependentWorkKernel{});
+        }
+
+        SECTION("IndependentWorkKernel, large block size")
+        {
+            // Launch the independent work kernel with a large block size and a single block; this relies on the kernel
+            // to check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test independent work kernel with large block size, using scalar dimensions");
+            testIndependentWorkKernel<TestType>(10, 1, 32, IndependentWorkKernel{});
+        }
+
+        SECTION("IndependentWorkKernel, many large blocks")
+        {
+            // Launch the independent work kernel with a large block size and a large number of blocks; this relies on
+            // the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test independent work kernel with large block size, using scalar dimensions");
+            testIndependentWorkKernel<TestType>(10, 32, 32, IndependentWorkKernel{});
+        }
+    }
+}
diff --git a/alpaka/test/unit/exec/src/Once.cpp b/alpaka/test/unit/exec/src/Once.cpp
new file mode 100644
index 00000000..1c9461f3
--- /dev/null
+++ b/alpaka/test/unit/exec/src/Once.cpp
@@ -0,0 +1,144 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/exec/Once.hpp"
+
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/meta/ForEachType.hpp"
+#include "alpaka/test/KernelExecutionFixture.hpp"
+#include "alpaka/test/acc/TestAccs.hpp"
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <type_traits>
+
+class KernelOncePerGrid
+{
+public:
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* status, int32_t* value) const -> void
+    {
+        // Only one thread in the whole grid should increment the conter.
+        if(alpaka::oncePerGrid(acc))
+        {
+            ALPAKA_CHECK(*status, *value == 0);
+
+            alpaka::atomicAdd(acc, value, 1, alpaka::hierarchy::Grids{});
+
+            ALPAKA_CHECK(*status, *value == 1);
+        }
+    }
+};
+
+// MSVC does not seem to recognize as "true" a value set to "true" in device code,
+// so force all object representations different from zero to evaluate as "true".
+inline void fixBooleanValue(bool& value)
+{
+    value = reinterpret_cast<char const&>(value) == 0x00 ? false : true;
+}
+
+TEMPLATE_LIST_TEST_CASE("oncePerGrid", "[exec]", alpaka::test::TestAccs)
+{
+    using Host = alpaka::DevCpu;
+    Host host = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    Platform platform;
+    Device device = alpaka::getDevByIdx(platform, 0);
+    Queue queue{device};
+
+    using Scalar = alpaka::Vec<alpaka::DimInt<0u>, Idx>;
+    auto value = alpaka::allocMappedBuf<int32_t, Idx>(host, platform, Scalar{});
+    *value = 0;
+
+    auto status = alpaka::allocMappedBuf<bool, Idx>(host, platform, Scalar{});
+    *status = true;
+
+    auto const extent = alpaka::Vec<Dim, Idx>::all(32);
+    auto const elems = alpaka::Vec<Dim, Idx>::all(4);
+
+    KernelOncePerGrid kernel;
+    alpaka::KernelCfg<Acc> const config = {extent, elems, false};
+    auto const workDiv = alpaka::getValidWorkDiv(config, device, kernel, std::data(status), std::data(value));
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, std::data(status), std::data(value));
+    alpaka::wait(queue);
+
+    fixBooleanValue(*status);
+    REQUIRE(*status == true);
+    REQUIRE(*value == 1);
+}
+
+class KernelOncePerBlock
+{
+public:
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* status, int32_t* value) const -> void
+    {
+        const int32_t blocks = static_cast<int32_t>(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc).prod());
+
+        // Only one thread in each block should increment the conter.
+        if(alpaka::oncePerBlock(acc))
+        {
+            // FIXME: implement alpaka::atomicLoad and use it here
+            int before = alpaka::atomicAdd(acc, value, 0, alpaka::hierarchy::Grids{});
+            ALPAKA_CHECK(*status, before >= 0);
+            ALPAKA_CHECK(*status, before < blocks);
+
+            alpaka::atomicAdd(acc, value, 1, alpaka::hierarchy::Grids{});
+
+            // FIXME: implement alpaka::atomicLoad and use it here
+            int after = alpaka::atomicAdd(acc, value, 0, alpaka::hierarchy::Grids{});
+            ALPAKA_CHECK(*status, after > 0);
+            ALPAKA_CHECK(*status, after <= blocks);
+        }
+    }
+};
+
+TEMPLATE_LIST_TEST_CASE("oncePerBlock", "[exec]", alpaka::test::TestAccs)
+{
+    using Host = alpaka::DevCpu;
+    Host host = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    Platform platform;
+    Device device = alpaka::getDevByIdx(platform, 0);
+    Queue queue{device};
+
+    using Scalar = alpaka::Vec<alpaka::DimInt<0u>, Idx>;
+    auto value = alpaka::allocMappedBuf<int32_t, Idx>(host, platform, Scalar{});
+    alpaka::memset(queue, value, 0x00);
+
+    auto status = alpaka::allocMappedBuf<bool, Idx>(host, platform, Scalar{});
+    alpaka::memset(queue, status, 0xff);
+
+    auto const extent = alpaka::Vec<Dim, Idx>::all(32);
+    auto const elems = alpaka::Vec<Dim, Idx>::all(4);
+
+    KernelOncePerBlock kernel;
+    alpaka::KernelCfg<Acc> const config = {extent, elems, false};
+    auto const workDiv = alpaka::getValidWorkDiv(config, device, kernel, std::data(status), std::data(value));
+    const int32_t blocks = static_cast<int32_t>(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(workDiv).prod());
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, std::data(status), std::data(value));
+    alpaka::wait(queue);
+
+    fixBooleanValue(*status);
+    REQUIRE(*status == true);
+    REQUIRE(*value == blocks);
+}
diff --git a/alpaka/test/unit/exec/src/UniformElements.cpp b/alpaka/test/unit/exec/src/UniformElements.cpp
new file mode 100644
index 00000000..0e080171
--- /dev/null
+++ b/alpaka/test/unit/exec/src/UniformElements.cpp
@@ -0,0 +1,710 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/exec/UniformElements.hpp"
+
+#include "WorkDiv.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/exec/Once.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/test/acc/TestAccs.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <random>
+#include <type_traits>
+
+#if BOOST_COMP_MSVC
+// MSVC uses __restrict instead of __restrict__.
+#    define __restrict__ __restrict
+#endif
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
+// Global Host object used by all tests.
+using Host = alpaka::DevCpu;
+static Host host = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0);
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+
+struct VectorAddKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> size) const
+    {
+        for(auto index : alpaka::uniformElements(acc, size))
+        {
+            out[index] = in1[index] + in2[index];
+        }
+    }
+};
+
+struct VectorAddKernelSkip
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> first,
+        alpaka::Idx<TAcc> size) const
+    {
+        for(auto index : alpaka::uniformElements(acc, first, size))
+        {
+            out[index] = in1[index] + in2[index];
+        }
+    }
+};
+
+struct VectorAddKernel1D
+{
+    template<typename TAcc, typename T, typename = std::enable_if_t<alpaka::Dim<TAcc>::value == 1u>>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> size) const
+    {
+        for(auto ndindex : alpaka::uniformElementsND(acc, size))
+        {
+            auto index = ndindex[0];
+            out[index] = in1[index] + in2[index];
+        }
+    }
+};
+
+struct VectorAddKernel2D
+{
+    template<typename TAcc, typename T, typename = std::enable_if_t<alpaka::Dim<TAcc>::value == 2u>>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> size) const
+    {
+        for(auto ndindex : alpaka::uniformElementsND(acc, size))
+        {
+            auto index = ndindex[0] * size[1] + ndindex[1];
+            out[index] = in1[index] + in2[index];
+        }
+    }
+};
+
+struct VectorAddKernel3D
+{
+    template<typename TAcc, typename T, typename = std::enable_if_t<alpaka::Dim<TAcc>::value == 3u>>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> size) const
+    {
+        for(auto ndindex : alpaka::uniformElementsND(acc, size))
+        {
+            auto index = (ndindex[0] * size[1] + ndindex[1]) * size[2] + ndindex[2];
+            out[index] = in1[index] + in2[index];
+        }
+    }
+};
+
+/* This is not an efficient approach, and it uses more operations and synchronisations than needed. It is written like
+ * this to test the use of dynamic shared memory, split block and element loops, and block-level synchronisations.
+ */
+
+struct VectorAddBlockKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> size) const
+    {
+        using Idx = alpaka::Idx<TAcc>;
+
+        // Get the block size.
+        auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+
+        // Get the dynamic shared memory buffer.
+        T* buffer = alpaka::getDynSharedMem<T>(acc);
+
+        // Split the loop over the elements into an outer loop over the groups and an inner loop over the elements, to
+        // permit the synchronisation of the threads in each block between each step: the outer loop is needed to
+        // repeat the "block" as many times as needed to cover the whole problem space; the inner loop is needed for
+        // work division with more than one element per thread.
+        for(auto block : alpaka::uniformGroups(acc, size))
+        {
+            // Initialise the shared memory.
+            if(alpaka::oncePerBlock(acc))
+            {
+                for(Idx local = 0; local < blockSize; ++local)
+                {
+                    buffer[local] = 0;
+                }
+            }
+            // Synchronise all threads in the block.
+            alpaka::syncBlockThreads(acc);
+            // Accumulate the first set of data into shared memory.
+            for(auto index : alpaka::uniformGroupElements(acc, block, size))
+            {
+                buffer[index.local] += in1[index.global];
+            }
+            // Synchronise all threads in the block.
+            alpaka::syncBlockThreads(acc);
+            // Accumulate the second set of data into shared memory.
+            for(auto index : alpaka::uniformGroupElements(acc, block, size))
+            {
+                buffer[index.local] += in2[index.global];
+            }
+            // Synchronise all threads in the block.
+            alpaka::syncBlockThreads(acc);
+            // Store the results into global memory.
+            for(auto index : alpaka::uniformGroupElements(acc, block, size))
+            {
+                out[index.global] = buffer[index.local];
+            }
+            // Synchronise all threads in the block; this is necessary to avoid race conditions between different
+            // iterations of the uniformGroups loop.
+            alpaka::syncBlockThreads(acc);
+        }
+    }
+};
+
+/* Run all operations in a single thread.
+ * Written in an inefficient way to test "oncePerGrid".
+ */
+
+struct VectorAddKernelSerial
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> size) const
+    {
+        using Idx = alpaka::Idx<TAcc>;
+
+        // The operations are performed by a single thread.
+        if(alpaka::oncePerGrid(acc))
+        {
+            for(Idx index = 0; index < size; ++index)
+            {
+                // Unsafe, used to test that each element is summed exactly once.
+                out[index] += in1[index];
+                out[index] += in2[index];
+            }
+        }
+    }
+};
+
+/* Run all operations in one thread per block.
+ * Written in an inefficient way to test "oncePerBlock".
+ */
+
+struct VectorAddKernelBlockSerial
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(
+        TAcc const& acc,
+        T const* __restrict__ in1,
+        T const* __restrict__ in2,
+        T* __restrict__ out,
+        alpaka::Idx<TAcc> size) const
+    {
+        using Idx = alpaka::Idx<TAcc>;
+
+        // Get the block size.
+        auto const blockSize = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
+        // The loop is used to repeat the "block" as many times as needed to cover the whole problem space.
+        for(auto block : alpaka::uniformGroups(acc, size))
+        {
+            // The operations are performed by a single thread in each "logical" block.
+            auto const first = blockSize * block;
+            auto const range = std::min<Idx>(first + blockSize, size);
+            if(alpaka::oncePerBlock(acc))
+            {
+                for(Idx index = first; index < range; ++index)
+                {
+                    // Unsafe, used to test that each element is summed exactly once.
+                    out[index] += in1[index];
+                    out[index] += in2[index];
+                }
+            }
+        }
+    }
+};
+
+namespace alpaka::trait
+{
+    // Specialize the BlockSharedMemDynSizeBytes trait to specify the amount of block shared dynamic memory for the
+    // VectorAddBlockKernel kernel.
+    template<typename TAcc>
+    struct BlockSharedMemDynSizeBytes<VectorAddBlockKernel, TAcc>
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        using Dim1D = alpaka::DimInt<1u>;
+        using Vec1D = alpaka::Vec<Dim1D, Idx>;
+
+        // The size in bytes of the shared memory allocated for a block.
+        template<typename T>
+        ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
+            VectorAddBlockKernel const& /* kernel */,
+            Vec1D threads,
+            Vec1D elements,
+            T const* __restrict__ /* in1 */,
+            T const* __restrict__ /* in2 */,
+            T* __restrict__ /* out */,
+            Idx /* size */)
+        {
+#if defined(__GNUC__)
+            // Silence a potential warning about
+            // warning: conversion to ‘long unsigned int’ from ‘long int’ may change the sign of the result
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wsign-conversion"
+#endif
+            return static_cast<std::size_t>(threads[0] * elements[0] * sizeof(T));
+#if defined(__GNUC__)
+#    pragma GCC diagnostic pop
+#endif
+        }
+    };
+} // namespace alpaka::trait
+
+// Test the 1-dimensional kernel on all devices.
+template<typename TAcc, typename TKernel>
+void testVectorAddKernel(
+    alpaka::Idx<TAcc> problem_size,
+    alpaka::Idx<TAcc> grid_size,
+    alpaka::Idx<TAcc> block_size,
+    TKernel kernel)
+{
+    using Acc = TAcc;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    // Random number generator with a gaussian distribution.
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0.f, 1.f};
+
+    // Tolerance.
+    constexpr float epsilon = 0.000001f;
+
+    // Buffer size.
+    const Idx size = problem_size;
+
+    // Initialise the accelerator platform.
+    Platform platform{};
+
+    // Allocate input and output host buffers in pinned memory accessible by the Platform devices.
+    auto in1_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto in2_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto out_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+
+    // Fill the input buffers with random data, and the output buffer with zeros.
+    for(Idx i = 0; i < size; ++i)
+    {
+        in1_h[i] = dist(rand);
+        in2_h[i] = dist(rand);
+        out_h[i] = 0.f;
+    }
+
+    // Run the test on each device.
+    for(auto const& device : alpaka::getDevs(platform))
+    {
+        /* clang-format off */
+        INFO("Test 1D vector addition on " << alpaka::getName(device) << " over " << problem_size << " values with "
+                                           << grid_size << " blocks of " << block_size << " elements");
+        /* clang-format on */
+        auto queue = Queue(device);
+
+        // Allocate input and output buffers on the device.
+        auto in1_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto in2_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto out_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+
+        // Copy the input data to the device; the size is known from the buffer objects.
+        alpaka::memcpy(queue, in1_d, in1_h);
+        alpaka::memcpy(queue, in2_d, in2_h);
+
+        // Fill the output buffer with zeros; the size is known from the buffer objects.
+        alpaka::memset(queue, out_d, 0);
+
+        // Launch the 1-dimensional kernel with scalar size.
+        auto div = makeWorkDiv<Acc>(grid_size, block_size);
+        alpaka::exec<Acc>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), size);
+
+        // Copy the results from the device to the host.
+        alpaka::memcpy(queue, out_h, out_d);
+
+        // Wait for all the operations to complete.
+        alpaka::wait(queue);
+
+        // Check the results.
+        for(Idx i = 0; i < size; ++i)
+        {
+            float sum = in1_h[i] + in2_h[i];
+            REQUIRE(out_h[i] < sum + epsilon);
+            REQUIRE(out_h[i] > sum - epsilon);
+        }
+    }
+}
+
+// Test the 1-dimensional kernel on all devices, potentially skipping some elements.
+template<typename TAcc, typename TKernel>
+void testVectorAddKernelSkip(
+    alpaka::Idx<TAcc> skip_elements,
+    alpaka::Idx<TAcc> problem_size,
+    alpaka::Idx<TAcc> grid_size,
+    alpaka::Idx<TAcc> block_size,
+    TKernel kernel)
+{
+    using Acc = TAcc;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    // Random number generator with a gaussian distribution.
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0.f, 1.f};
+
+    // Tolerance.
+    constexpr float epsilon = 0.000001f;
+
+    // Buffer size.
+    const Idx size = problem_size;
+
+    // Initialise the accelerator platform.
+    Platform platform{};
+
+    // Allocate input and output host buffers in pinned memory accessible by the Platform devices.
+    auto in1_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto in2_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto out_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+
+    // Fill the input buffers with random data, and the output buffer with zeros.
+    for(Idx i = 0; i < size; ++i)
+    {
+        in1_h[i] = dist(rand);
+        in2_h[i] = dist(rand);
+        out_h[i] = 0.f;
+    }
+
+    // Run the test on each device.
+    for(auto const& device : alpaka::getDevs(platform))
+    {
+        /* clang-format off */
+        INFO("Test 1D vector addition on " << alpaka::getName(device) << " skipping " << skip_elements << " over "
+                                           << problem_size << " values with " << grid_size << " blocks of "
+                                           << block_size << " elements");
+        /* clang-format on */
+        auto queue = Queue(device);
+
+        // Allocate input and output buffers on the device.
+        auto in1_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto in2_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto out_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+
+        // Copy the input data to the device; the size is known from the buffer objects.
+        alpaka::memcpy(queue, in1_d, in1_h);
+        alpaka::memcpy(queue, in2_d, in2_h);
+
+        // Fill the output buffer with zeros; the size is known from the buffer objects.
+        alpaka::memset(queue, out_d, 0);
+
+        // Launch the 1-dimensional kernel with scalar size.
+        auto div = makeWorkDiv<Acc>(grid_size, block_size);
+        alpaka::exec<Acc>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), skip_elements, size);
+
+        // Copy the results from the device to the host.
+        alpaka::memcpy(queue, out_h, out_d);
+
+        // Wait for all the operations to complete.
+        alpaka::wait(queue);
+
+        // Check the results.
+        for(Idx i = 0; i < skip_elements; ++i)
+        {
+            // The first part of the output vector should not have been modified at all, and should be identically 0.
+#ifdef __GNUC__
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+            REQUIRE(out_h[i] == 0);
+#ifdef __GNUC__
+#    pragma GCC diagnostic pop
+#endif
+        }
+        for(Idx i = skip_elements; i < size; ++i)
+        {
+            float sum = in1_h[i] + in2_h[i];
+            REQUIRE(out_h[i] < sum + epsilon);
+            REQUIRE(out_h[i] > sum - epsilon);
+        }
+    }
+}
+
+// Test the N-dimensional kernels on all devices.
+template<typename TAcc, typename TKernel>
+void testVectorAddKernelND(
+    alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> problem_size,
+    alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> grid_size,
+    alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> block_size,
+    TKernel kernel)
+{
+    using Acc = TAcc;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Platform = alpaka::Platform<Acc>;
+    using Device = alpaka::Dev<Platform>;
+    using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+    // Random number generator with a gaussian distribution.
+    std::random_device rd{};
+    std::default_random_engine rand{rd()};
+    std::normal_distribution<float> dist{0.f, 1.f};
+
+    // Tolerance.
+    constexpr float epsilon = 0.000001f;
+
+    // Linearised buffer size.
+    const Idx size = problem_size.prod();
+
+    // Initialise the accelerator platform.
+    Platform platform{};
+
+    // Allocate input and output host buffers in pinned memory accessible by the Platform devices.
+    auto in1_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto in2_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+    auto out_h = alpaka::allocMappedBuf<float, Idx>(host, platform, size);
+
+    // Fill the input buffers with random data, and the output buffer with zeros.
+    for(Idx i = 0; i < size; ++i)
+    {
+        in1_h[i] = dist(rand);
+        in2_h[i] = dist(rand);
+        out_h[i] = 0.f;
+    }
+
+    // Run the test on each device.
+    for(auto const& device : alpaka::getDevs(platform))
+    {
+        /* clang-format off */
+        INFO("Test " << Dim::value << "D vector addition on " << alpaka::getName(device) << " over " << problem_size
+                     << " values with " << grid_size << " blocks of " << block_size << " elements");
+        /* clang-format on */
+        auto queue = Queue(device);
+
+        // Allocate input and output buffers on the device.
+        auto in1_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto in2_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+        auto out_d = alpaka::allocAsyncBufIfSupported<float, Idx>(queue, size);
+
+        // Copy the input data to the device; the size is known from the buffer objects.
+        alpaka::memcpy(queue, in1_d, in1_h);
+        alpaka::memcpy(queue, in2_d, in2_h);
+
+        // Fill the output buffer with zeros; the size is known from the buffer objects.
+        alpaka::memset(queue, out_d, 0);
+
+        // Launch the 3-dimensional kernel.
+        auto div = makeWorkDiv<Acc>(grid_size, block_size);
+        alpaka::exec<Acc>(queue, div, kernel, in1_d.data(), in2_d.data(), out_d.data(), problem_size);
+
+        // Copy the results from the device to the host.
+        alpaka::memcpy(queue, out_h, out_d);
+
+        // Wait for all the operations to complete.
+        alpaka::wait(queue);
+
+        // Check the results.
+        for(Idx i = 0; i < size; ++i)
+        {
+            float sum = in1_h[i] + in2_h[i];
+            REQUIRE(out_h[i] < sum + epsilon);
+            REQUIRE(out_h[i] > sum - epsilon);
+        }
+    }
+}
+
+TEMPLATE_LIST_TEST_CASE("UniformElements", "[exec]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    // 1-dimensional kernels.
+    if constexpr(Dim::value == 1)
+    {
+        SECTION("VectorAddKernel1D, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector addition with small block size");
+            testVectorAddKernelND<TestType, VectorAddKernel1D>(Vec{10000}, Vec{32}, Vec{32}, VectorAddKernel1D{});
+        }
+
+        SECTION("VectorAddKernel1D, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector addition with large block size");
+            testVectorAddKernelND<TestType>({10}, {2}, {32}, VectorAddKernel1D{});
+        }
+    }
+
+    // 2-dimensional kernels.
+    if constexpr(Dim::value == 2)
+    {
+        SECTION("VectorAddKernel2D, small block size")
+        {
+            // Launch the 2-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 2D vector addition with small block size");
+            testVectorAddKernelND<TestType>({400, 250}, {5, 4}, {8, 4}, VectorAddKernel2D{});
+        }
+
+        SECTION("VectorAddKernel2D, large block size")
+        {
+            // Launch the 2-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 2D vector addition with large block size");
+            testVectorAddKernelND<TestType>({5, 3}, {2, 2}, {8, 4}, VectorAddKernel2D{});
+        }
+    }
+
+    // 3-dimensional kernels.
+    if constexpr(Dim::value == 3)
+    {
+        SECTION("VectorAddKernel3D, small block size")
+        {
+            // Launch the 3-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 3D vector addition with small block size");
+            testVectorAddKernelND<TestType>({50, 25, 16}, {5, 2, 2}, {2, 4, 4}, VectorAddKernel3D{});
+        }
+
+        SECTION("VectorAddKernel3D, large block size")
+        {
+            // Launch the 3-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 3D vector addition with large block size");
+            testVectorAddKernelND<TestType>({2, 3, 3}, {2, 2, 2}, {2, 4, 4}, VectorAddKernel3D{});
+        }
+    }
+
+    // 1-dimensional kernels.
+    if constexpr(Dim::value == 1)
+    {
+        SECTION("VectorAddKernel, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector addition with small block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10000, 32, 32, VectorAddKernel{});
+        }
+
+        SECTION("VectorAddKernel, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector addition with large block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10, 2, 32, VectorAddKernel{});
+        }
+
+        SECTION("VectorAddBlockKernel, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector block-level addition with small block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10000, 32, 32, VectorAddBlockKernel{});
+        }
+
+        SECTION("VectorAddBlockKernel, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector block-level addition with large block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10, 2, 32, VectorAddBlockKernel{});
+        }
+
+        SECTION("VectorAddKernelSerial, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector single-threaded serial addition with small block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10000, 32, 32, VectorAddKernelSerial{});
+        }
+
+        SECTION("VectorAddKernelSerial, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector single-threaded seria addition with large block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10, 2, 32, VectorAddKernelSerial{});
+        }
+
+        SECTION("VectorAddKernelBlockSerial, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector block-level serial addition with small block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10000, 32, 32, VectorAddKernelBlockSerial{});
+        }
+
+        SECTION("VectorAddKernelBlockSerial, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector block-level serial addition with large block size, using scalar dimensions");
+            testVectorAddKernel<TestType>(10, 2, 32, VectorAddKernelBlockSerial{});
+        }
+
+        SECTION("VectorAddKernelSkip, small block size")
+        {
+            // Launch the 1-dimensional kernel with a small block size and a small number of blocks; this relies on the
+            // kernel to loop over the "problem space" and do more work per block.
+            INFO("Test 1D vector addition with small block size, using scalar dimensions");
+            testVectorAddKernelSkip<TestType>(20, 10000, 32, 32, VectorAddKernelSkip{});
+        }
+
+        SECTION("VectorAddKernelSkip, large block size")
+        {
+            // Launch the 1-dimensional kernel with a large block size and a single block; this relies on the kernel to
+            // check the size of the "problem space" and avoid accessing out-of-bounds data.
+            INFO("Test 1D vector addition with large block size, using scalar dimensions");
+            testVectorAddKernelSkip<TestType>(2, 10, 2, 32, VectorAddKernelSkip{});
+        }
+    }
+}
diff --git a/alpaka/test/unit/exec/src/WorkDiv.hpp b/alpaka/test/unit/exec/src/WorkDiv.hpp
new file mode 100644
index 00000000..c93d4865
--- /dev/null
+++ b/alpaka/test/unit/exec/src/WorkDiv.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <type_traits>
+
+// Create an accelerator-dependent work division for 1-dimensional kernels.
+template<typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+inline auto makeWorkDiv(alpaka::Idx<TAcc> blocks, alpaka::Idx<TAcc> elements)
+    -> alpaka::WorkDivMembers<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>
+{
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    if constexpr(alpaka::isMultiThreadAcc<TAcc>)
+    {
+        // On thread-parallel backends, each thread is looking at a single element:
+        //   - the number of threads per block is "elements";   - the number of elements per thread is always 1.
+        return WorkDiv{blocks, elements, Idx{1}};
+    }
+    else
+    {
+        // On thread-serial backends, run serially with a single thread per block:
+        //   - the number of threads per block is always 1;   - the number of elements per thread is "elements".
+        return WorkDiv{blocks, Idx{1}, elements};
+    }
+
+    ALPAKA_UNREACHABLE(WorkDiv{blocks, elements, Idx{1}});
+}
+
+// Create the accelerator-dependent workdiv for N-dimensional kernels.
+template<typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+inline auto makeWorkDiv(
+    alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const& blocks,
+    alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const& elements)
+    -> alpaka::WorkDivMembers<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>
+{
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    if constexpr(alpaka::isMultiThreadAcc<TAcc>)
+    {
+        // On thread-parallel backends, each thread is looking at a single element:
+        //   - the number of threads per block is "elements";   - the number of elements per thread is always 1.
+        return WorkDiv{blocks, elements, Vec::ones()};
+    }
+    else
+    {
+        // On thread-serial backends, run serially with a single thread per block:
+        //   - the number of threads per block is always 1;   - the number of elements per thread is "elements".
+        return WorkDiv{blocks, Vec::ones(), elements};
+    }
+
+    ALPAKA_UNREACHABLE(WorkDiv{blocks, elements, Vec::ones()});
+}
diff --git a/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp b/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp
index 327d9a46..6a341cac 100644
--- a/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp
+++ b/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp
@@ -3,7 +3,6 @@
  */
 
 #include <alpaka/dev/Traits.hpp>
-#include <alpaka/example/ExampleDefaultAcc.hpp>
 #include <alpaka/idx/Accessors.hpp>
 #include <alpaka/idx/MapIdx.hpp>
 #include <alpaka/mem/view/ViewPlainPtr.hpp>
@@ -14,15 +13,16 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
-TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims)
+template<typename TDim, typename TAccTag>
+auto mapIdxPitchBytes(TAccTag const&)
 {
-    using Dim = TestType;
+    using Dim = TDim;
     using Idx = std::size_t;
     using Vec = alpaka::Vec<Dim, Idx>;
 
     auto const extentNd = alpaka::test::extentBuf<Dim, Idx>;
 
-    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Acc = alpaka::TagToAcc<TAccTag, Dim, Idx>;
     using Elem = std::uint8_t;
     auto const platformAcc = alpaka::Platform<Acc>{};
     auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
@@ -45,3 +45,9 @@ TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDi
     // roundtrip
     REQUIRE(idxNd == idxNdResult);
 }
+
+TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::NonZeroTestDims)
+{
+    // execute the example once for each enabled accelerator
+    std::apply([](auto const&... tags) { (mapIdxPitchBytes<TestType>(tags), ...); }, alpaka::EnabledAccTags{});
+}
diff --git a/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp b/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
index 64d11308..dc7cfc07 100644
--- a/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
+++ b/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -7,23 +7,6 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-//! It is not possible to use a alpaka kernel function object without a templated operator() when the CUDA accelerator
-//! is hard-coded.
-//!
-//! However, compiling such kernels with a CPU device works fine.
-//!
-//! When the CUDA accelerator is used, the following error is triggered:
-//! /alpaka/include/alpaka/workdiv/Traits.hpp(...): error: calling a __device__ function("getWorkDiv") from a __host__
-//! __device__ function("getWorkDiv") is not allowed The kernel function objects function call operator is attributed
-//! with ALPAKA_FN_ACC which is identical to __host__ __device__. The 'alpaka::getWorkDiv<...>(acc)' function that is
-//! called has the ALPAKA_FN_HOST_ACC attribute (also equal to __host__ __device__). The underlying trait calls the
-//! CUDA specialized method which has the __device__ attribute. Because this call chain does not contain any templates
-//! and therefore no calls depending on input types, everything can be resolved at the first time the template is
-//! parsed which results in the given error.
-//!
-//! Currently, the only possible way to solve this is to make the function call operator a template nonetheless by
-//! providing an unused template parameter.
-
 using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 #if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
@@ -56,15 +39,11 @@ TEST_CASE("kernelNoTemplateCpu", "[kernel]")
 }
 #endif
 
-/*#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-//! DO NOT ENABLE! COMPILATION WILL FAIL!
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
 struct KernelNoTemplateGpu
 {
     ALPAKA_FN_ACC
-    auto operator()(
-        AccGpu const & acc,
-        bool* success) const
-    -> void
+    auto operator()(AccGpu const& acc, bool* success) const -> void
     {
         ALPAKA_CHECK(
             *success,
@@ -74,17 +53,16 @@ struct KernelNoTemplateGpu
 
 TEST_CASE("kernelNoTemplateGpu", "[kernel]")
 {
-    alpaka::test::KernelExecutionFixture<AccGpu> fixture(
-        alpaka::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<AccGpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelNoTemplateGpu kernel;
 
     REQUIRE(fixture(kernel));
 }
-#endif*/
+#endif
 
 #if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
-struct KernelWithoutTemplateParamCpu
+struct KernelUnusedTemplateParamCpu
 {
     template<typename TNotUsed = void>
     ALPAKA_FN_ACC auto operator()(AccCpu const& acc, bool* success) const -> void
@@ -95,18 +73,18 @@ struct KernelWithoutTemplateParamCpu
     }
 };
 
-TEST_CASE("kernelWithoutTemplateParamCpu", "[kernel]")
+TEST_CASE("kernelUnusedTemplateParamCpu", "[kernel]")
 {
     alpaka::test::KernelExecutionFixture<AccCpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    KernelWithoutTemplateParamCpu kernel;
+    KernelUnusedTemplateParamCpu kernel;
 
     REQUIRE(fixture(kernel));
 }
 #endif
 
 #if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-struct KernelWithoutTemplateParamGpu
+struct KernelUnusedTemplateParamGpu
 {
     template<typename TNotUsed = void>
     ALPAKA_FN_ACC auto operator()(AccGpu const& acc, bool* success) const -> void
@@ -117,11 +95,11 @@ struct KernelWithoutTemplateParamGpu
     }
 };
 
-TEST_CASE("kernelWithoutTemplateParamGpu", "[kernel]")
+TEST_CASE("kernelUnusedTemplateParamGpu", "[kernel]")
 {
     alpaka::test::KernelExecutionFixture<AccGpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    KernelWithoutTemplateParamGpu kernel;
+    KernelUnusedTemplateParamGpu kernel;
 
     REQUIRE(fixture(kernel));
 }
diff --git a/alpaka/test/unit/math/src/Buffer.hpp b/alpaka/test/unit/math/src/Buffer.hpp
index fbffd348..bd532281 100644
--- a/alpaka/test/unit/math/src/Buffer.hpp
+++ b/alpaka/test/unit/math/src/Buffer.hpp
@@ -38,12 +38,10 @@ namespace mathtest
         using PlatformAcc = alpaka::Platform<DevAcc>;
         using BufAcc = alpaka::Buf<DevAcc, TData, Dim, Idx>;
 
-        PlatformHost platformHost;
         DevHost devHost;
 
         BufHost hostBuffer;
         BufAcc devBuffer;
-        PlatformAcc platformAcc;
 
         // Native pointer to access buffer.
         TData* const pHostBuffer;
@@ -54,12 +52,12 @@ namespace mathtest
         Buffer() = delete;
 
         // Constructor needs to initialize all Buffer.
-        Buffer(DevAcc const& devAcc)
+        Buffer(DevAcc const& devAcc, PlatformHost const& platformHost, PlatformAcc const& platformAcc)
             : devHost{alpaka::getDevByIdx(platformHost, 0)}
             , hostBuffer{alpaka::allocMappedBufIfSupported<TData, Idx>(devHost, platformAcc, Tcapacity)}
             , devBuffer{alpaka::allocBuf<TData, Idx>(devAcc, Tcapacity)}
-            , pHostBuffer{alpaka::getPtrNative(hostBuffer)}
-            , pDevBuffer{alpaka::getPtrNative(devBuffer)}
+            , pHostBuffer{std::data(hostBuffer)}
+            , pDevBuffer{std::data(devBuffer)}
         {
         }
 
diff --git a/alpaka/test/unit/math/src/DataGen.hpp b/alpaka/test/unit/math/src/DataGen.hpp
index 78a5d054..8b35cbee 100644
--- a/alpaka/test/unit/math/src/DataGen.hpp
+++ b/alpaka/test/unit/math/src/DataGen.hpp
@@ -11,6 +11,11 @@
 #include <limits>
 #include <random>
 
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wswitch-default"
+#endif
+
 namespace mathtest
 {
     //! Helper to generate random numbers of the given type for testing
@@ -199,3 +204,7 @@ namespace mathtest
         }
     }
 } // namespace mathtest
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
diff --git a/alpaka/test/unit/math/src/TestTemplate.hpp b/alpaka/test/unit/math/src/TestTemplate.hpp
index b417aeb4..093e06f5 100644
--- a/alpaka/test/unit/math/src/TestTemplate.hpp
+++ b/alpaka/test/unit/math/src/TestTemplate.hpp
@@ -74,9 +74,6 @@ namespace mathtest
             // DevAcc is defined in Buffer.hpp too.
             using DevAcc = alpaka::Dev<TAcc>;
 
-            using Dim = alpaka::DimInt<1u>;
-            using Idx = std::size_t;
-            using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
             using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
             using TArgsItem = ArgsItem<TData, TFunctor::arity>;
 
@@ -98,15 +95,20 @@ namespace mathtest
 
             TestKernel<capacity> kernel;
             TFunctor functor;
-            Args args{devAcc};
-            Results results{devAcc};
-
-            WorkDiv const workDiv = alpaka::getValidWorkDiv<TAcc>(
+            Args args{devAcc, platformHost, platformAcc};
+            Results results{devAcc, platformHost, platformAcc};
+
+            // Let alpaka calculate good block and grid sizes given our full problem extent
+            alpaka::KernelCfg<TAcc> const kernelCfg
+                = {sizeExtent, elementsPerThread, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
+            auto const workDiv = alpaka::getValidWorkDiv(
+                kernelCfg,
                 devAcc,
-                sizeExtent,
-                elementsPerThread,
-                false,
-                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+                kernel,
+                results.pDevBuffer,
+                wrappedFunctor,
+                args.pDevBuffer);
+
             // SETUP COMPLETED.
 
             // Fill the buffer with random test-numbers.
@@ -174,7 +176,7 @@ namespace mathtest
             if(!isFinite(a) && !isFinite(b))
                 return true;
             // For the same reason use relative difference comparison with a large margin
-            auto const scalingFactor = static_cast<T>(std::is_same_v<T, float> ? 1.1e4 : 1.1e6);
+            auto const scalingFactor = static_cast<T>(std::is_same_v<T, float> ? 1.5e4 : 1.1e6);
             auto const marginValue = scalingFactor * std::numeric_limits<T>::epsilon();
             return (a.real() == Catch::Approx(b.real()).margin(marginValue).epsilon(marginValue))
                    && (a.imag() == Catch::Approx(b.imag()).margin(marginValue).epsilon(marginValue));
diff --git a/alpaka/test/unit/mem/view/src/ViewConst.cpp b/alpaka/test/unit/mem/view/src/ViewConst.cpp
index a7cfe155..8d8d50d7 100644
--- a/alpaka/test/unit/mem/view/src/ViewConst.cpp
+++ b/alpaka/test/unit/mem/view/src/ViewConst.cpp
@@ -74,40 +74,18 @@ TEMPLATE_LIST_TEST_CASE("viewConstTest", "[memView]", alpaka::test::TestAccs)
     alpaka::test::iotaFillView(queue, buf);
     auto const offsets = alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(0));
 
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 2, 0)
-    auto view = alpaka::ViewConst<decltype(buf)>(buf);
-#else
     auto view = alpaka::ViewConst(buf);
-#endif
     alpaka::test::testViewConst<Acc>(view, dev, queue, extents, offsets);
 
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 2, 0)
-    auto const cview = alpaka::ViewConst<decltype(buf)>(buf);
-#else
     auto const cview = alpaka::ViewConst(buf);
-#endif
     alpaka::test::testViewConst<Acc>(cview, dev, queue, extents, offsets);
 
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 2, 0)
-    auto view_cbuf = alpaka::ViewConst<decltype(buf)>(std::as_const(buf));
-#else
     auto view_cbuf = alpaka::ViewConst(std::as_const(buf));
-#endif
     alpaka::test::testViewConst<Acc>(view_cbuf, dev, queue, extents, offsets);
 
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 2, 0)
-    auto const cview_cbuf = alpaka::ViewConst<decltype(buf)>(std::as_const(buf));
-#else
     auto const cview_cbuf = alpaka::ViewConst(std::as_const(buf));
-#endif
     alpaka::test::testViewConst<Acc>(cview_cbuf, dev, queue, extents, offsets);
 
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 2, 0)
-    using BufType = std::remove_const_t<decltype(cview_cbuf)>;
-    auto yolo = alpaka::ViewConst<alpaka::ViewConst<alpaka::ViewConst<BufType>>>(
-        alpaka::ViewConst<alpaka::ViewConst<BufType>>(alpaka::ViewConst<BufType>(cview_cbuf)));
-#else
     auto yolo = alpaka::ViewConst(alpaka::ViewConst(alpaka::ViewConst(cview_cbuf)));
-#endif
     alpaka::test::testViewConst<Acc>(yolo, dev, queue, extents, offsets);
 }
diff --git a/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp b/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
index 543d90b3..a42a37c1 100644
--- a/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
+++ b/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
@@ -1,5 +1,5 @@
-/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci,
+ * Aurora Perego SPDX-License-Identifier: MPL-2.0
  */
 
 #include <alpaka/core/BoostPredef.hpp>
@@ -16,21 +16,17 @@ using Elem = std::uint32_t;
 using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 
-#if !defined(ALPAKA_ACC_SYCL_ENABLED)
+ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, Elem[3][2]> g_globalMemory2DUninitialized;
 
-// These forward declarations are only necessary when you want to access those variables
-// from a different compilation unit and should be moved to a common header.
-// Here they are used to silence clang`s -Wmissing-variable-declarations warning
-// that forces every non-static variable to be declared with extern before the are defined.
-extern ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
-ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
+ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const Elem[3][2]> g_constantMemory2DUninitialized;
 
 //! Uses static device memory on the accelerator defined globally for the whole compilation unit.
 struct StaticDeviceMemoryTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc, typename TElem>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, bool* success, TElem const* const pConstantMem) const
+    template<typename TAcc>
+    // ALPAKA_FN_ACC void operator()(TAcc const& acc, bool* success) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, bool* success) const
     {
         auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
         auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
@@ -38,30 +34,54 @@ struct StaticDeviceMemoryTestKernel
         auto const offset = gridThreadExtent[1u] * gridThreadIdx[0u] + gridThreadIdx[1u];
         auto const val = offset;
 
-        ALPAKA_CHECK(*success, val == *(pConstantMem + offset));
+        ALPAKA_CHECK(*success, val == *((&g_globalMemory2DUninitialized<TAcc>.get())[0][0] + offset));
     }
 };
 
-#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+struct ConstantDeviceMemoryTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, bool* success) const
+    {
+        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+
+        auto const offset = gridThreadExtent[1u] * gridThreadIdx[0u] + gridThreadIdx[1u];
+        auto const val = offset;
+
+        ALPAKA_CHECK(*success, val == *((&g_constantMemory2DUninitialized<TAcc>.get())[0][0] + offset));
+    }
+};
 
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_CPU)
+template<typename TDim, typename TIdx>
+using EnabledAccsElseInt = std::tuple<alpaka::AccCpuSycl<TDim, TIdx>>;
+template<typename TDim, typename TIdx>
+using EnabledAccs = typename alpaka::meta::Filter<EnabledAccsElseInt<TDim, TIdx>, std::is_class>;
+using TestAccs = EnabledAccs<Dim, Idx>;
+#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_GPU)
+template<typename TDim, typename TIdx>
+using EnabledAccsElseInt = std::tuple<alpaka::AccGpuSyclIntel<TDim, TIdx>>;
+template<typename TDim, typename TIdx>
+using EnabledAccs = typename alpaka::meta::Filter<EnabledAccsElseInt<TDim, TIdx>, std::is_class>;
+using TestAccs = EnabledAccs<Dim, Idx>;
+#else
 using TestAccs = alpaka::test::EnabledAccs<Dim, Idx>;
+#endif
 
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
 {
-#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
     auto const platformAcc = alpaka::Platform<Acc>{};
+
     auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
 
     alpaka::Vec<Dim, Idx> const extent(3u, 2u);
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
-
-    StaticDeviceMemoryTestKernel kernel;
-
-    // uninitialized static constant device memory
+    // uninitialized static global device memory
     {
         auto const platformHost = alpaka::PlatformCpu{};
         auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -69,39 +89,28 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc
         using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
         QueueAcc queueAcc(devAcc);
 
+        alpaka::test::KernelExecutionFixture<Acc> fixture(queueAcc, extent);
+
+        StaticDeviceMemoryTestKernel kernel;
+
         std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
         auto bufHost = alpaka::createView(devHost, data.data(), extent);
 
-        auto viewConstantMemUninitialized
-            = alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, extent);
-
-        alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHost);
+        alpaka::memcpy(queueAcc, g_globalMemory2DUninitialized<Acc>, bufHost, extent);
         alpaka::wait(queueAcc);
 
-        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized)));
-    }
+        REQUIRE(fixture(kernel));
 
-#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
-
-    WARN("The SYCL backend does not support global device variables.");
-
-#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+        std::vector<Elem> data2(6, 0u);
+        auto bufHost2 = alpaka::createView(devHost, data2.data(), extent);
+        alpaka::memcpy(queueAcc, bufHost2, g_globalMemory2DUninitialized<Acc>, extent);
+        alpaka::wait(queueAcc);
+        REQUIRE(data == data2);
+    }
 }
 
-#if !defined(ALPAKA_ACC_SYCL_ENABLED)
-
-// These forward declarations are only necessary when you want to access those variables
-// from a different compilation unit and should be moved to a common header.
-// Here they are used to silence clang`s -Wmissing-variable-declarations warning
-// that forces every non-static variable to be declared with extern before the are defined.
-extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
-ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
-
-#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
-
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
 {
-#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
@@ -110,11 +119,7 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test
 
     alpaka::Vec<Dim, Idx> const extent(3u, 2u);
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
-
-    StaticDeviceMemoryTestKernel kernel;
-
-    // uninitialized static global device memory
+    // uninitialized static constant device memory
     {
         auto const platformHost = alpaka::PlatformCpu{};
         auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -122,21 +127,22 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test
         using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
         QueueAcc queueAcc(devAcc);
 
+        alpaka::test::KernelExecutionFixture<Acc> fixture(queueAcc, extent);
+
+        ConstantDeviceMemoryTestKernel kernel;
+
         std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
         auto bufHost = alpaka::createView(devHost, data.data(), extent);
 
-        auto viewGlobalMemUninitialized
-            = alpaka::createStaticDevMemView(&g_globalMemory2DUninitialized[0u][0u], devAcc, extent);
-
-        alpaka::memcpy(queueAcc, viewGlobalMemUninitialized, bufHost);
+        alpaka::memcpy(queueAcc, g_constantMemory2DUninitialized<Acc>, bufHost);
         alpaka::wait(queueAcc);
 
-        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized)));
-    }
+        REQUIRE(fixture(kernel));
 
-#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
-
-    WARN("The SYCL backend does not support global device constants.");
-
-#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+        std::vector<Elem> data2(6, 0u);
+        auto bufHost2 = alpaka::createView(devHost, data2.data(), extent);
+        alpaka::memcpy(queueAcc, bufHost2, g_constantMemory2DUninitialized<Acc>);
+        alpaka::wait(queueAcc);
+        REQUIRE(data == data2);
+    }
 }
diff --git a/alpaka/test/unit/meta/src/CudaVectorArrayWrapperTest.cpp b/alpaka/test/unit/meta/src/CudaVectorArrayWrapperTest.cpp
deleted file mode 100644
index d325fc83..00000000
--- a/alpaka/test/unit/meta/src/CudaVectorArrayWrapperTest.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright 2022 Jiří Vyskočil, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-#    include <alpaka/math/FloatEqualExact.hpp>
-#    include <alpaka/meta/CudaVectorArrayWrapper.hpp>
-#    include <alpaka/meta/IsStrictBase.hpp>
-#    include <alpaka/rand/Traits.hpp>
-#    include <alpaka/test/KernelExecutionFixture.hpp>
-#    include <alpaka/test/acc/TestAccs.hpp>
-
-#    include <catch2/catch_template_test_macros.hpp>
-#    include <catch2/catch_test_macros.hpp>
-
-#    include <type_traits>
-
-/* The tests here use equals for comparing float values for exact equality. This is not
- * an issue of arithmetics. We are testing whether the values saved in a container are the same as the ones retrieved
- * from it afterwards. In this case, returning a value that would not be exactly but only approximately equal to the
- * one that was stored in the container would be a grave error.
- */
-template<typename T1, typename T2>
-ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC bool equals(T1 a, T2 b)
-{
-    return a == static_cast<T1>(b);
-}
-
-template<>
-ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC bool equals<float, float>(float a, float b)
-{
-    return alpaka::math::floatEqualExactNoWarning(a, b);
-}
-
-template<>
-ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC bool equals<double, double>(double a, double b)
-{
-    return alpaka::math::floatEqualExactNoWarning(a, b);
-}
-
-template<typename T>
-class CudaVectorArrayWrapperTestKernel
-{
-public:
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc>
-    ALPAKA_FN_ACC auto operator()(TAcc const& /* acc */, bool* success) const -> void
-    {
-        using T1 = alpaka::meta::CudaVectorArrayWrapper<T, 1>;
-        T1 t1{0};
-        static_assert(T1::size == 1, "CudaVectorArrayWrapper in-kernel size test failed!");
-        static_assert(std::tuple_size_v<T1> == 1, "CudaVectorArrayWrapper in-kernel tuple_size test failed!");
-        static_assert(std::is_same_v<decltype(t1[0]), T&>, "CudaVectorArrayWrapper in-kernel type test failed!");
-        ALPAKA_CHECK(*success, equals(t1[0], T{0}));
-
-        using T2 = alpaka::meta::CudaVectorArrayWrapper<T, 2>;
-        T2 t2{0, 1};
-        static_assert(T2::size == 2, "CudaVectorArrayWrapper in-kernel size test failed!");
-        static_assert(std::tuple_size_v<T2> == 2, "CudaVectorArrayWrapper in-kernel tuple_size test failed!");
-        static_assert(std::is_same_v<decltype(t2[0]), T&>, "CudaVectorArrayWrapper in-kernel type test failed!");
-        ALPAKA_CHECK(*success, equals(t2[0], T{0}));
-        ALPAKA_CHECK(*success, equals(t2[1], T{1}));
-
-        using T3 = alpaka::meta::CudaVectorArrayWrapper<T, 3>;
-        T3 t3{0, 0, 0};
-        t3 = {0, 1, 2};
-        static_assert(T3::size == 3, "CudaVectorArrayWrapper in-kernel size test failed!");
-        static_assert(std::tuple_size_v<T3> == 3, "CudaVectorArrayWrapper in-kernel tuple_size test failed!");
-        static_assert(std::is_same_v<decltype(t3[0]), T&>, "CudaVectorArrayWrapper in-kernel type test failed!");
-        ALPAKA_CHECK(*success, equals(t3[0], T{0}));
-        ALPAKA_CHECK(*success, equals(t3[1], T{1}));
-        ALPAKA_CHECK(*success, equals(t3[2], T{2}));
-
-        using T4 = alpaka::meta::CudaVectorArrayWrapper<T, 4>;
-        T4 t4{0, 0, 0, 0};
-        t4[1] = 1;
-        t4[2] = t4[1] + 1;
-        t4[3] = t4[2] + t2[1];
-        static_assert(T4::size == 4, "CudaVectorArrayWrapper in-kernel size test failed!");
-        static_assert(std::tuple_size_v<T4> == 4, "CudaVectorArrayWrapper in-kernel tuple_size test failed!");
-        static_assert(std::is_same_v<decltype(t4[0]), T&>, "CudaVectorArrayWrapper in-kernel type test failed!");
-        ALPAKA_CHECK(*success, equals(t4[0], T{0}));
-        ALPAKA_CHECK(*success, equals(t4[1], T{1}));
-        ALPAKA_CHECK(*success, equals(t4[2], T{2}));
-        ALPAKA_CHECK(*success, equals(t4[3], T{3}));
-    }
-};
-
-TEMPLATE_LIST_TEST_CASE("cudaVectorArrayWrapperDevice", "[meta]", alpaka::test::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::Dim<Acc>;
-    using Idx = alpaka::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
-
-    CudaVectorArrayWrapperTestKernel<int> kernelInt;
-    REQUIRE(fixture(kernelInt));
-
-    CudaVectorArrayWrapperTestKernel<unsigned> kernelUnsigned;
-    REQUIRE(fixture(kernelUnsigned));
-
-    CudaVectorArrayWrapperTestKernel<float> kernelFloat;
-    REQUIRE(fixture(kernelFloat));
-
-    CudaVectorArrayWrapperTestKernel<double> kernelDouble;
-    REQUIRE(fixture(kernelDouble));
-}
-
-TEST_CASE("cudaVectorArrayWrapperHost", "[meta]")
-{
-    // TODO: It would be nice to check all possible type vs. size combinations.
-
-    using Float1 = alpaka::meta::CudaVectorArrayWrapper<float, 1>;
-    Float1 floatWrapper1{-1.0f};
-    STATIC_REQUIRE(Float1::size == 1);
-    STATIC_REQUIRE(std::tuple_size_v<Float1> == 1);
-    STATIC_REQUIRE(std::is_same_v<decltype(floatWrapper1[0]), float&>);
-    STATIC_REQUIRE(alpaka::meta::IsStrictBase<float1, Float1>::value);
-    REQUIRE(equals(floatWrapper1[0], -1.0f));
-
-    using Int1 = alpaka::meta::CudaVectorArrayWrapper<int, 1>;
-    Int1 intWrapper1 = {-42};
-    STATIC_REQUIRE(Int1::size == 1);
-    STATIC_REQUIRE(std::tuple_size_v<Int1> == 1);
-    STATIC_REQUIRE(std::is_same_v<decltype(intWrapper1[0]), int&>);
-    STATIC_REQUIRE(alpaka::meta::IsStrictBase<int1, Int1>::value);
-    REQUIRE(intWrapper1[0] == -42);
-
-    using Uint2 = alpaka::meta::CudaVectorArrayWrapper<unsigned, 2>;
-    Uint2 uintWrapper2{0u, 1u};
-    STATIC_REQUIRE(Uint2::size == 2);
-    STATIC_REQUIRE(std::tuple_size_v<Uint2> == 2);
-    STATIC_REQUIRE(std::is_same_v<decltype(uintWrapper2[0]), unsigned&>);
-    STATIC_REQUIRE(alpaka::meta::IsStrictBase<uint2, Uint2>::value);
-    REQUIRE(uintWrapper2[0] == 0u);
-    REQUIRE(uintWrapper2[1] == 1u);
-
-    using Uint4 = alpaka::meta::CudaVectorArrayWrapper<unsigned, 4>;
-    Uint4 uintWrapper4{0u, 0u, 0u, 0u};
-    STATIC_REQUIRE(Uint4::size == 4);
-    STATIC_REQUIRE(std::tuple_size_v<Uint4> == 4);
-    STATIC_REQUIRE(std::is_same_v<decltype(uintWrapper4[0]), unsigned&>);
-    STATIC_REQUIRE(alpaka::meta::IsStrictBase<uint4, Uint4>::value);
-    uintWrapper4[1] = 1u;
-    uintWrapper4[2] = uintWrapper4[1] + 1u;
-    uintWrapper4[3] = uintWrapper4[2] + uintWrapper2[1];
-    REQUIRE(uintWrapper4[0] == 0u);
-    REQUIRE(uintWrapper4[1] == 1u);
-    REQUIRE(uintWrapper4[2] == 2u);
-    REQUIRE(uintWrapper4[3] == 3u);
-
-    using Double3 = alpaka::meta::CudaVectorArrayWrapper<double, 3>;
-    Double3 doubleWrapper3{0.0, 0.0, 0.0};
-    doubleWrapper3 = {0.0, -1.0, -2.0};
-    STATIC_REQUIRE(Double3::size == 3);
-    STATIC_REQUIRE(std::tuple_size_v<Double3> == 3);
-    STATIC_REQUIRE(std::is_same_v<decltype(doubleWrapper3[0]), double&>);
-    STATIC_REQUIRE(alpaka::meta::IsStrictBase<double3, Double3>::value);
-    REQUIRE(equals(doubleWrapper3[0], 0.0));
-    REQUIRE(equals(doubleWrapper3[1], -1.0));
-    REQUIRE(equals(doubleWrapper3[2], -2.0));
-}
-
-#endif
diff --git a/alpaka/test/unit/meta/src/IsArrayOrVectorTest.cpp b/alpaka/test/unit/meta/src/IsArrayOrVectorTest.cpp
index 9558afda..dc511d66 100644
--- a/alpaka/test/unit/meta/src/IsArrayOrVectorTest.cpp
+++ b/alpaka/test/unit/meta/src/IsArrayOrVectorTest.cpp
@@ -3,6 +3,7 @@
  */
 
 #include <alpaka/meta/IsArrayOrVector.hpp>
+#include <alpaka/vec/Vec.hpp>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -14,6 +15,7 @@ TEST_CASE("isArrayOrVector", "[meta]")
 {
     STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<std::array<int, 10>>::value);
     STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<std::vector<float>>::value);
+    STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<alpaka::Vec<alpaka::DimInt<6u>, float>>::value);
 
     [[maybe_unused]] float arrayFloat[4] = {1.0f, 2.0f, 3.0f, 4.0f};
     STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<decltype(arrayFloat)>::value);
@@ -30,18 +32,3 @@ TEST_CASE("isActuallyNotArrayOrVector", "[meta]")
     std::string notAnArrayString{"alpaka"};
     STATIC_REQUIRE_FALSE(alpaka::meta::IsArrayOrVector<decltype(notAnArrayString)>::value);
 }
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-TEST_CASE("isArrayOrVectorCudaWrappers", "[meta]")
-{
-    STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<alpaka::meta::CudaVectorArrayWrapper<double, 1>>::value);
-    STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<alpaka::meta::CudaVectorArrayWrapper<unsigned, 2>>::value);
-    STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<alpaka::meta::CudaVectorArrayWrapper<int, 3>>::value);
-    STATIC_REQUIRE(alpaka::meta::IsArrayOrVector<alpaka::meta::CudaVectorArrayWrapper<float, 4>>::value);
-}
-
-TEST_CASE("isNotArrayOrVectorCudaVector", "[meta]")
-{
-    STATIC_REQUIRE_FALSE(alpaka::meta::IsArrayOrVector<uint4>::value);
-}
-#endif
diff --git a/alpaka/test/unit/meta/src/TypeListOpsTest.cpp b/alpaka/test/unit/meta/src/TypeListOpsTest.cpp
index b15e7773..1ee418cd 100644
--- a/alpaka/test/unit/meta/src/TypeListOpsTest.cpp
+++ b/alpaka/test/unit/meta/src/TypeListOpsTest.cpp
@@ -9,6 +9,11 @@
 #include <tuple>
 #include <type_traits>
 
+template<typename... TTypes>
+struct TypeList
+{
+};
+
 TEST_CASE("front", "[meta]")
 {
     STATIC_REQUIRE(std::is_same_v<alpaka::meta::Front<std::tuple<int>>, int>);
@@ -26,3 +31,39 @@ TEST_CASE("contains", "[meta]")
     STATIC_REQUIRE(alpaka::meta::Contains<std::tuple<short, int, double, float>, float>::value);
     STATIC_REQUIRE(!alpaka::meta::Contains<std::tuple<short, int, double, float>, char>::value);
 }
+
+TEST_CASE("isList", "[meta]")
+{
+    STATIC_REQUIRE(alpaka::meta::isList<std::tuple<int>>);
+    STATIC_REQUIRE(alpaka::meta::isList<std::tuple<int, float>>);
+    STATIC_REQUIRE_FALSE(alpaka::meta::isList<int>);
+
+    STATIC_REQUIRE(alpaka::meta::isList<TypeList<int>>);
+    STATIC_REQUIRE(alpaka::meta::isList<TypeList<int, float, double>>);
+}
+
+TEST_CASE("ToList", "[meta]")
+{
+    STATIC_REQUIRE(std::is_same_v<typename alpaka::meta::ToList<TypeList, int>::type, TypeList<int>>);
+    STATIC_REQUIRE(std::is_same_v<
+                   typename alpaka::meta::ToList<TypeList, float, double, int>::type,
+                   TypeList<float, double, int>>);
+    STATIC_REQUIRE(
+        std::is_same_v<typename alpaka::meta::ToList<TypeList, TypeList<unsigned int>>::type, TypeList<unsigned int>>);
+    STATIC_REQUIRE(std::is_same_v<
+                   typename alpaka::meta::ToList<TypeList, TypeList<float, double, int>>::type,
+                   TypeList<float, double, int>>);
+
+    STATIC_REQUIRE(std::is_same_v<typename alpaka::meta::ToList<std::tuple, int>::type, std::tuple<int>>);
+    STATIC_REQUIRE(
+        std::is_same_v<typename alpaka::meta::ToList<std::tuple, std::tuple<float>>::type, std::tuple<float>>);
+}
+
+TEST_CASE("toTuple", "[meta]")
+{
+    STATIC_REQUIRE(std::is_same_v<alpaka::meta::ToTuple<int>, std::tuple<int>>);
+    STATIC_REQUIRE(std::is_same_v<alpaka::meta::ToTuple<int, float, double>, std::tuple<int, float, double>>);
+    STATIC_REQUIRE(std::is_same_v<alpaka::meta::ToTuple<std::tuple<int>>, std::tuple<int>>);
+    STATIC_REQUIRE(
+        std::is_same_v<alpaka::meta::ToTuple<std::tuple<int, float, double>>, std::tuple<int, float, double>>);
+}
diff --git a/alpaka/test/unit/queue/src/QueueTest.cpp b/alpaka/test/unit/queue/src/QueueTest.cpp
index 60344ed6..32553a4e 100644
--- a/alpaka/test/unit/queue/src/QueueTest.cpp
+++ b/alpaka/test/unit/queue/src/QueueTest.cpp
@@ -269,3 +269,12 @@ TEMPLATE_LIST_TEST_CASE("enqueueBenchmark", "[queue]", alpaka::test::TestQueues)
         return count.load();
     };
 }
+
+TEMPLATE_LIST_TEST_CASE("isQueue", "[queue]", alpaka::test::TestQueues)
+{
+    using DevQueue = TestType;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
+    Fixture f;
+
+    REQUIRE(alpaka::isQueue<decltype(f.m_queue)>);
+}
diff --git a/alpaka/test/unit/rand/src/PhiloxTest.cpp b/alpaka/test/unit/rand/src/PhiloxTest.cpp
new file mode 100644
index 00000000..833ab473
--- /dev/null
+++ b/alpaka/test/unit/rand/src/PhiloxTest.cpp
@@ -0,0 +1,187 @@
+/* Copyright 2024 Jiri Vyskocil
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/rand/RandPhilox.hpp>
+#include <alpaka/rand/RandPhiloxStateless.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+class PhiloxTest
+{
+protected:
+    alpaka::rand::Philox4x32x10 statefulSingleEngine;
+    alpaka::rand::Philox4x32x10Vector statefulVectorEngine;
+};
+
+TEST_CASE_METHOD(PhiloxTest, "HostStatefulVectorEngineTest")
+{
+    auto const resultVec = statefulVectorEngine();
+    for(auto& result : resultVec)
+    {
+        REQUIRE(result >= statefulVectorEngine.min());
+        REQUIRE(result <= statefulVectorEngine.max());
+    }
+}
+
+TEST_CASE_METHOD(PhiloxTest, "HostStatefulSingleEngineTest")
+{
+    auto const result = statefulSingleEngine();
+    REQUIRE(result >= statefulSingleEngine.min());
+    REQUIRE(result <= statefulSingleEngine.max());
+}
+
+TEST_CASE("HostStatelessEngineTest")
+{
+    using Gen = alpaka::rand::PhiloxStateless4x32x10Vector;
+    using Key = typename Gen::Key;
+    using Counter = typename Gen::Counter;
+    Key key = {42, 12345};
+    Counter counter1 = {6789, 321, 0, 0};
+    auto const result1 = Gen::generate(counter1, key);
+    Counter counter2 = {6789, 321, 0, 1};
+    auto const result2 = Gen::generate(counter2, key);
+    // Make sure that the inputs are really expected to lead to different results.
+    REQUIRE(result1 != result2);
+}
+
+template<typename T>
+class PhiloxTestKernelSingle
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename T_Generator>
+    ALPAKA_FN_ACC void genNumbers(TAcc const& acc, bool* success, T_Generator& gen) const
+    {
+        {
+            static_cast<void>(acc);
+            alpaka::rand::UniformReal<T> dist;
+            auto const result = dist(gen);
+            ALPAKA_CHECK(*success, static_cast<T>(0.0) <= result);
+            ALPAKA_CHECK(*success, static_cast<T>(1.0) > result);
+        }
+    }
+
+public:
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        // Philox generator for accelerator
+        auto generator = alpaka::rand::Philox4x32x10(42, 12345, 6789);
+        genNumbers<TAcc, decltype(generator)>(acc, success, generator);
+    }
+};
+
+template<typename T>
+class PhiloxTestKernelVector
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename T_Generator>
+    ALPAKA_FN_ACC void genNumbers(TAcc const& acc, bool* success, T_Generator& gen) const
+    {
+        {
+            static_cast<void>(acc);
+            using DistributionResult = typename T_Generator::template ResultContainer<T>;
+            alpaka::rand::UniformReal<DistributionResult> dist;
+            auto const result = dist(gen);
+            for(auto& element : result)
+            {
+                ALPAKA_CHECK(*success, static_cast<T>(0.0) <= element);
+                ALPAKA_CHECK(*success, static_cast<T>(1.0) > element);
+            }
+        }
+    }
+
+public:
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        // Philox generator for accelerator
+        auto generator = alpaka::rand::Philox4x32x10Vector(42, 12345, 6789);
+        genNumbers<TAcc, decltype(generator)>(acc, success, generator);
+    }
+};
+
+class PhiloxTestKernelStateless
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC void genNumbers(TAcc const& acc, bool* success) const
+    {
+        {
+            static_cast<void>(acc);
+
+            using Gen = alpaka::rand::PhiloxStateless4x32x10Vector;
+            using Key = typename Gen::Key;
+            using Counter = typename Gen::Counter;
+
+            Key key = {42, 12345};
+            Counter counter = {6789, 321, 0, 0};
+            auto const result = Gen::generate(counter, key);
+
+            size_t check = 0;
+            for(auto& element : result)
+            {
+                check += element;
+            }
+            // Make sure the sequence is not in fact supposed to generate {0,0,0,0}.
+            ALPAKA_CHECK(*success, check != 0);
+        }
+    }
+
+public:
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        genNumbers<TAcc>(acc, success);
+    }
+};
+
+TEMPLATE_LIST_TEST_CASE("PhiloxRandomGeneratorStatelessIsWorking", "[rand]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Idx = alpaka::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    PhiloxTestKernelStateless kernel;
+
+    REQUIRE(fixture(kernel));
+}
+
+using TestScalars = std::tuple<float, double>;
+using TestTypes = alpaka::meta::CartesianProduct<std::tuple, alpaka::test::TestAccs, TestScalars>;
+
+TEMPLATE_LIST_TEST_CASE("PhiloxRandomGeneratorSingleIsWorking", "[rand]", TestTypes)
+{
+    using Acc = std::tuple_element_t<0, TestType>;
+    using DataType = std::tuple_element_t<1, TestType>;
+    using Idx = alpaka::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    PhiloxTestKernelSingle<DataType> kernel;
+
+    REQUIRE(fixture(kernel));
+}
+
+TEMPLATE_LIST_TEST_CASE("PhiloxRandomGeneratorVectorIsWorking", "[rand]", TestTypes)
+{
+    using Acc = std::tuple_element_t<0, TestType>;
+    using DataType = std::tuple_element_t<1, TestType>;
+    using Idx = alpaka::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    PhiloxTestKernelVector<DataType> kernel;
+
+    REQUIRE(fixture(kernel));
+}
diff --git a/alpaka/test/unit/runtime/CMakeLists.txt b/alpaka/test/unit/runtime/CMakeLists.txt
new file mode 100644
index 00000000..54ed2cfa
--- /dev/null
+++ b/alpaka/test/unit/runtime/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+# SPDX-License-Identifier: MPL-2.0
+#
+
+set(_TARGET_NAME "runtimeTest")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_RUNTIME")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})
diff --git a/alpaka/test/unit/runtime/src/KernelThrow.cpp b/alpaka/test/unit/runtime/src/KernelThrow.cpp
new file mode 100644
index 00000000..832608fe
--- /dev/null
+++ b/alpaka/test/unit/runtime/src/KernelThrow.cpp
@@ -0,0 +1,84 @@
+/* Copyright 2022  René Widera, Mehmet Yusufoglu, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/core/RuntimeMacros.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+class KernelWithThrow
+{
+public:
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc) const -> void
+    {
+        using Idx = alpaka::Idx<TAcc>;
+        using Dim = alpaka::Dim<TAcc>;
+        using Vec = alpaka::Vec<Dim, Idx>;
+        Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        if(globalThreadIdx == Vec::zeros())
+        {
+            // Throw abort or std::runtime_error depending on acc type
+            ALPAKA_THROW_ACC("Exception thrown by the kernel.");
+        }
+        alpaka::syncBlockThreads(acc);
+    }
+};
+
+template<typename T, typename Acc>
+void checkThrow(std::string const& expectedErrStr)
+{
+    if constexpr(alpaka::accMatchesTags<Acc, T>)
+    {
+        using Idx = alpaka::Idx<Acc>;
+        using Dim = alpaka::Dim<Acc>;
+        using Vec = alpaka::Vec<Dim, Idx>;
+        using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
+
+        auto const platformAcc = alpaka::Platform<Acc>{};
+        auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);
+
+        Queue queue(devAcc);
+        auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{Vec{8}, Vec{1}, Vec{1}};
+
+        try
+        {
+            alpaka::exec<Acc>(queue, workDiv, KernelWithThrow{});
+            // Cuda can catch exceptions which were thrown at kernel during the wait(); therefore wait is added.
+            alpaka::wait(queue);
+        }
+        catch(std::runtime_error& e)
+        {
+            std::string const errorStr{e.what()};
+            printf("The error str catched: %s \n", errorStr.c_str());
+            printf("The expected str in error str: %s \n", expectedErrStr.c_str());
+
+            auto const found = errorStr.find(expectedErrStr);
+            CHECK(found != std::string::npos);
+        }
+        catch(std::exception& e)
+        {
+            FAIL(std::string("Wrong exception type thrown in kernel:") + e.what());
+        }
+    }
+}
+
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
+
+TEMPLATE_LIST_TEST_CASE("ThrowForCpuThreadAndSerial", "[runtime]", TestAccs)
+{
+    using Acc = TestType;
+    // Test runtime-error exceptions.
+    checkThrow<alpaka::TagCpuThreads, Acc>("Exception thrown by the kernel");
+    checkThrow<alpaka::TagCpuSerial, Acc>("Exception thrown by the kernel");
+}
+
+TEMPLATE_LIST_TEST_CASE("ThrowForGpuBackend", "[runtime]", TestAccs)
+{
+    using Acc = TestType;
+    // Test runtime-error exceptions.
+    checkThrow<alpaka::TagGpuCudaRt, Acc>("cudaErrorLaunchFailure");
+}
diff --git a/alpaka/test/unit/vec/src/VecTest.cpp b/alpaka/test/unit/vec/src/VecTest.cpp
index f6ac3498..c428307e 100644
--- a/alpaka/test/unit/vec/src/VecTest.cpp
+++ b/alpaka/test/unit/vec/src/VecTest.cpp
@@ -468,3 +468,28 @@ TEST_CASE("accessByNameConstexpr", "[vec]")
     STATIC_REQUIRE(v4.z() == 3);
     STATIC_REQUIRE(v4.w() == 4);
 }
+
+TEMPLATE_TEST_CASE("Vec generator constructor", "[vec]", std::size_t, int, unsigned, float, double)
+{
+    // Define a generator function
+    auto generator = [](auto index) { return static_cast<TestType>(index.value + 1); };
+
+    // Create a Vec object using the generator function
+    alpaka::Vec<alpaka::DimInt<5>, TestType> vec(generator);
+
+    // Check that the values in the Vec object are as expected
+    for(std::size_t i = 0; i < 5; ++i)
+    {
+        // Floating point types require a precision check instead of an exact == match
+        if constexpr(std::is_floating_point<TestType>::value)
+        {
+            // Arbitrary precision requirement
+            auto const precision = std::numeric_limits<TestType>::epsilon() * 5;
+            CHECK(std::abs(vec[i] - static_cast<TestType>(i + 1)) < precision);
+        }
+        else
+        {
+            CHECK(vec[i] == static_cast<TestType>(i + 1));
+        }
+    }
+}
diff --git a/alpaka/test/unit/workDiv/CMakeLists.txt b/alpaka/test/unit/workDiv/CMakeLists.txt
index 0b4ebdd6..40ac9080 100644
--- a/alpaka/test/unit/workDiv/CMakeLists.txt
+++ b/alpaka/test/unit/workDiv/CMakeLists.txt
@@ -10,6 +10,11 @@ append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
+
+target_include_directories(
+    ${_TARGET_NAME}
+    PRIVATE "src")
+
 target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
diff --git a/alpaka/test/unit/workDiv/src/FooVec.hpp b/alpaka/test/unit/workDiv/src/FooVec.hpp
new file mode 100644
index 00000000..f26376ed
--- /dev/null
+++ b/alpaka/test/unit/workDiv/src/FooVec.hpp
@@ -0,0 +1,112 @@
+/* Copyright 2022 Sergei Bastrakov, Jan Stephan, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <alpaka/dim/Traits.hpp>
+#include <alpaka/elem/Traits.hpp>
+#include <alpaka/extent/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
+
+#include <array>
+#include <iostream>
+
+//! User defined vector for testing the usability of any vector type.
+//!
+//! \tparam TVal The data type.
+//! \tparam N Vector size as a non-type parameter.
+template<typename TVal, std::size_t N>
+class FooVec
+{
+public:
+    static_assert(N <= 3, "Size must be 3 or smaller");
+    std::array<TVal, N> arr;
+
+    // Default Constructor
+    FooVec()
+    {
+        arr.fill(TVal());
+    }
+
+    // Constructor with initializer list
+    FooVec(std::initializer_list<TVal> initList)
+    {
+        if(initList.size() <= N)
+        {
+            std::copy(initList.begin(), initList.end(), arr.begin());
+        }
+        else
+        {
+            throw std::out_of_range("Initializer list size exceeds array size");
+        }
+    }
+
+    // Example member function to print the contents of the array
+    void printArray() const
+    {
+        for(auto const& element : arr)
+        {
+            std::cout << element << ' ';
+        }
+        std::cout << std::endl;
+    }
+};
+
+namespace alpaka::trait
+{
+
+    //! The DimType specialization for the user defined vector
+    //! \tparam TVal The data type.
+    //! \tparam N Vector size as a non-type parameter.
+    template<typename TVal, size_t N>
+    struct DimType<FooVec<TVal, N>>
+    {
+        using type = alpaka::DimInt<N>;
+    };
+
+    //! The ElemType specialization for the user defined vector
+    //! \tparam TVal The data type.
+    //! \tparam N Vector size as a non-type parameter.
+    template<typename TVal, size_t N>
+    struct ElemType<FooVec<TVal, N>>
+    {
+        using type = TVal;
+    };
+
+    //! The IdxType specialization for the user defined vecto
+    //! \tparam TVal The data type.
+    //! \tparam N Vector size as a non-type parameter.
+    template<typename TVal, size_t N>
+    struct IdxType<FooVec<TVal, N>>
+    {
+        using type = std::size_t;
+    };
+
+    //! Specialization for the user defined vector type FooVec. This specialization makes the vector usable in
+    //! WorkDivMembers construction. Since alpaka vectors use z-y-x order, FooVec is reversed.
+    //! \tparam TVal The element type of the vector type
+    //! \tparam N The size of the vector type
+    template<typename TVal, size_t N>
+    struct GetExtents<FooVec<TVal, N>>
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC constexpr auto operator()(FooVec<TVal, N> const& extent) const
+            -> alpaka::Vec<DimInt<N>, TVal>
+        {
+            alpaka::Vec<DimInt<N>, TVal> v{};
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(DimInt<N>::value > 0)
+#else
+            if constexpr(DimInt<N>::value > 0)
+#endif
+            {
+                // Reverse the vector since the dimensions ordered as z-y-x in alpaka
+                for(unsigned i = 0; i < DimInt<N>::value; i++)
+                    v[i] = extent.arr[DimInt<N>::value - i - 1];
+            }
+
+            return v;
+        }
+    };
+} // namespace alpaka::trait
diff --git a/alpaka/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/alpaka/test/unit/workDiv/src/WorkDivForKernelTest.cpp
new file mode 100644
index 00000000..8d809616
--- /dev/null
+++ b/alpaka/test/unit/workDiv/src/WorkDivForKernelTest.cpp
@@ -0,0 +1,223 @@
+/* Copyright 2022 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include <alpaka/acc/AccCpuOmp2Blocks.hpp>
+#include <alpaka/acc/AccCpuOmp2Threads.hpp>
+#include <alpaka/acc/AccCpuSerial.hpp>
+#include <alpaka/acc/AccCpuTbbBlocks.hpp>
+#include <alpaka/acc/AccDevProps.hpp>
+#include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/kernel/KernelFunctionAttributes.hpp>
+#include <alpaka/math/MathStdLib.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/workdiv/WorkDivHelpers.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+struct TestKernelWithManyRegisters
+{
+    template<typename TAcc>
+    [[maybe_unused]] ALPAKA_FN_ACC auto operator()(TAcc const& acc, std::size_t val) const -> void
+    {
+        double var0 = 1.0;
+        double var1 = 2.0;
+        double var2 = 3.0;
+
+        // Define many variables and use some calculations in order to prevent compiler optimization and make the
+        // kernel use many registers (around 80 on sm_52). Using many registers per SM decreases the max number of
+        // threads per block while this kernel is being run.
+
+        // TODO: Use function templates to parametrize and shorten the code!
+        double var3 = var2 + alpaka::math::fmod(acc, var2, 5);
+        double var4 = var3 + alpaka::math::fmod(acc, var3, 5);
+        double var5 = var4 + alpaka::math::fmod(acc, var4, 5);
+        double var6 = var5 + alpaka::math::fmod(acc, var5, 5);
+        double var7 = var6 + alpaka::math::fmod(acc, var6, 5);
+        double var8 = var7 + alpaka::math::fmod(acc, var7, 5);
+        double var9 = var8 + alpaka::math::fmod(acc, var8, 5);
+        double var10 = var9 + alpaka::math::fmod(acc, var9, 5);
+        double var11 = var10 + alpaka::math::fmod(acc, var10, 5);
+        double var12 = var11 + alpaka::math::fmod(acc, var11, 5);
+        double var13 = var12 + alpaka::math::fmod(acc, var12, 5);
+        double var14 = var13 + alpaka::math::fmod(acc, var13, 5);
+        double var15 = var14 + alpaka::math::fmod(acc, var14, 5);
+        double var16 = var15 + alpaka::math::fmod(acc, var15, 5);
+        double var17 = var16 + alpaka::math::fmod(acc, var16, 5);
+        double var18 = var17 + alpaka::math::fmod(acc, var17, 5);
+        double var19 = var18 + alpaka::math::fmod(acc, var18, 5);
+        double var20 = var19 + alpaka::math::fmod(acc, var19, 5);
+        double var21 = var20 + alpaka::math::fmod(acc, var20, 5);
+        double var22 = var21 + alpaka::math::fmod(acc, var21, 5);
+        double var23 = var22 + alpaka::math::fmod(acc, var22, 5);
+        double var24 = var23 + alpaka::math::fmod(acc, var23, 5);
+        double var25 = var24 + alpaka::math::fmod(acc, var24, 5);
+        double var26 = var25 + alpaka::math::fmod(acc, var25, 5);
+        double var27 = var26 + alpaka::math::fmod(acc, var26, 5);
+        double var28 = var27 + alpaka::math::fmod(acc, var27, 5);
+        double var29 = var28 + alpaka::math::fmod(acc, var28, 5);
+        double var30 = var29 + alpaka::math::fmod(acc, var29, 5);
+        double var31 = var30 + alpaka::math::fmod(acc, var30, 5);
+        double var32 = var31 + alpaka::math::fmod(acc, var31, 5);
+        double var33 = var32 + alpaka::math::fmod(acc, var32, 5);
+        double var34 = var33 + alpaka::math::fmod(acc, var33, 5);
+        double var35 = var34 + alpaka::math::fmod(acc, var34, 5);
+
+        double sum = var0 + var1 + var2 + var3 + var4 + var5 + var6 + var7 + var8 + var9 + var10 + var11 + var12
+                     + var13 + var14 + var15 + var16 + var17 + var18 + var19 + var20 + var21 + var22 + var23 + var24
+                     + var25 + var26 + var27 + var28 + var29 + var30 + var31 + var32 + var33 + var34 + var35;
+        printf("The sum is %5.2f, the argument is %lu\n", sum, val);
+    }
+};
+
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
+
+TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.1D", "[workDivKernel]", TestAccs)
+{
+    using Acc = TestType;
+    using Idx = alpaka::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+
+    TestKernelWithManyRegisters kernel;
+
+    // Get the device properties and hard limits
+    auto const props = alpaka::getAccDevProps<Acc>(dev);
+    Idx const elementsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
+
+    // Test the getValidWorkDiv function for elementsPerGridTestValue threads per grid.
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec{elementsPerGridTestValue}, Vec{1}};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul);
+
+    // Test the isValidWorkDiv function
+    CHECK(alpaka::isValidWorkDiv<Acc>(workDiv, dev, kernel, 200ul));
+
+    // Get calculated threads per block from the workDiv that was found by examining the kernel function.
+    Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
+
+    // Get the device limit.
+    Idx const threadsPerBlockLimit = props.m_blockThreadCountMax;
+
+    // Check that the number of threads per block is within the device limit.
+    CHECK(threadsPerBlock <= threadsPerBlockLimit);
+
+    // Check that using the maximum number of threads per block is valid.
+    auto const validWorkDiv = WorkDiv{Vec{elementsPerGridTestValue / threadsPerBlock}, Vec{threadsPerBlock}, Vec{1}};
+    CHECK(alpaka::isValidWorkDiv<Acc>(validWorkDiv, dev, kernel, 200ul));
+
+    // Check that using too many threads per block is not valid.
+    auto const invalidThreads = WorkDiv{Vec{1}, Vec{2 * threadsPerBlockLimit}, Vec{1}};
+    CHECK(not alpaka::isValidWorkDiv<Acc>(invalidThreads, dev, kernel, 200ul));
+
+    // Check that a work division with a single block, thread and element is always valid
+    auto const serialWorkDiv = WorkDiv{Vec{1}, Vec{1}, Vec{1}};
+    CHECK(alpaka::isValidWorkDiv<Acc>(serialWorkDiv, dev, kernel, 200ul));
+
+    // Some accelerators support only one thread per block:
+    if constexpr(alpaka::isSingleThreadAcc<Acc>)
+    {
+        // Check that the compute work division uses a single thread per block.
+        auto const expectedWorkDiv = WorkDiv{Vec{elementsPerGridTestValue}, Vec{1}, Vec{1}};
+        CHECK(workDiv == expectedWorkDiv);
+
+        // Check that a work division with more than one thread per block is not valid.
+        auto const parallelWorkDiv = WorkDiv{Vec{1}, Vec{2}, Vec{1}};
+        CHECK(not alpaka::isValidWorkDiv<Acc>(parallelWorkDiv, dev, kernel, 200ul));
+    }
+
+    // Check the maxDynamicSharedSizeBytes for CPU backends
+    if constexpr(alpaka::accMatchesTags<
+                     Acc,
+                     alpaka::TagCpuSerial,
+                     alpaka::TagCpuThreads,
+                     alpaka::TagCpuOmp2Blocks,
+                     alpaka::TagCpuOmp2Threads,
+                     alpaka::TagCpuTbbBlocks>)
+    {
+        int const maxDynamicSharedSizeBytes
+            = alpaka::getFunctionAttributes<Acc>(dev, kernel, 200ul).maxDynamicSharedSizeBytes;
+        CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
+    }
+}
+
+using TestAccs2D = alpaka::test::EnabledAccs<alpaka::DimInt<2u>, std::uint32_t>;
+
+TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.2D", "[workDivKernel]", TestAccs2D)
+{
+    using Acc = TestType;
+    using Idx = alpaka::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+
+    TestKernelWithManyRegisters kernel;
+
+    // Get the device properties and hard limits
+    auto const props = alpaka::getAccDevProps<Acc>(dev);
+    Idx const elementsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
+
+    // Test getValidWorkDiv function for elementsPerGridTestValue threads per grid.
+    alpaka::KernelCfg<Acc> const kernelCfg = {Vec{8, elementsPerGridTestValue / 8}, Vec{1, 1}};
+    auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, dev, kernel, 200ul);
+
+    // Test the isValidWorkDiv function
+    CHECK(alpaka::isValidWorkDiv<Acc>(workDiv, dev, kernel, 200ul));
+
+    // The valid workdiv values for the kernel may change depending on the GPU type and compiler.
+    // Therefore the generated workdiv is not compared to a specific workdiv in this test.
+
+    // Get calculated threads per block from the workDiv that was found by examining the kernel function.
+    Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
+
+    // Get the device limit.
+    Idx const threadsPerBlockLimit = props.m_blockThreadCountMax;
+
+    // Check that the number of threads per block is within the device limit.
+    CHECK(threadsPerBlock <= threadsPerBlockLimit);
+
+    // Check that using the maximum number of threads per block is valid.
+    auto const validWorkDiv
+        = WorkDiv{Vec{8, elementsPerGridTestValue / threadsPerBlock / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}};
+    CHECK(alpaka::isValidWorkDiv<Acc>(validWorkDiv, dev, kernel, 200ul));
+
+    // Check that using too many threads per block is not valid.
+    auto const invalidThreads = WorkDiv{Vec{1, 1}, Vec{2, threadsPerBlockLimit}, Vec{1, 1}};
+    CHECK(not alpaka::isValidWorkDiv<Acc>(invalidThreads, dev, kernel, 200ul));
+
+    // Check that a work division with a single block, thread and element is always valid
+    auto const serialWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 1}, Vec{1, 1}};
+    CHECK(alpaka::isValidWorkDiv<Acc>(serialWorkDiv, dev, kernel, 200ul));
+
+    // Some accelerators support only one thread per block:
+    if constexpr(alpaka::isSingleThreadAcc<Acc>)
+    {
+        // Check that the compute work division uses a single thread per block.
+        auto const expectedWorkDiv = WorkDiv{Vec{8, elementsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}};
+        CHECK(workDiv == expectedWorkDiv);
+
+        // Check that a work division with more than one thread per block is not valid.
+        auto const parallelWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 2}, Vec{1, 1}};
+        CHECK(not alpaka::isValidWorkDiv<Acc>(parallelWorkDiv, dev, kernel, 200ul));
+    }
+
+    // Check the maxDynamicSharedSizeBytes for CPU backends
+    if constexpr(alpaka::accMatchesTags<
+                     Acc,
+                     alpaka::TagCpuSerial,
+                     alpaka::TagCpuThreads,
+                     alpaka::TagCpuOmp2Blocks,
+                     alpaka::TagCpuOmp2Threads,
+                     alpaka::TagCpuTbbBlocks>)
+    {
+        int const maxDynamicSharedSizeBytes
+            = alpaka::getFunctionAttributes<Acc>(dev, kernel, 200ul).maxDynamicSharedSizeBytes;
+        CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
+    }
+}
diff --git a/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp b/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp
index b85aec82..b41f24af 100644
--- a/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp
+++ b/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp
@@ -1,7 +1,9 @@
-/* Copyright 2022 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber, Mehmet Yusufoglu
  * SPDX-License-Identifier: MPL-2.0
  */
 
+#include "FooVec.hpp"
+
 #include <alpaka/acc/AccDevProps.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
@@ -10,37 +12,6 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
-#include <tuple>
-
-namespace
-{
-    template<typename TAcc>
-    auto getWorkDiv()
-    {
-        using Dim = alpaka::Dim<TAcc>;
-        using Idx = alpaka::Idx<TAcc>;
-
-        auto const platform = alpaka::Platform<TAcc>{};
-        auto const dev = alpaka::getDevByIdx(platform, 0);
-        auto const gridThreadExtent = alpaka::Vec<Dim, Idx>::all(10);
-        auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
-        auto const workDiv = alpaka::getValidWorkDiv<TAcc>(
-            dev,
-            gridThreadExtent,
-            threadElementExtent,
-            false,
-            alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
-        return workDiv;
-    }
-} // namespace
-
-TEMPLATE_LIST_TEST_CASE("getValidWorkDiv", "[workDiv]", alpaka::test::TestAccs)
-{
-    using Acc = TestType;
-    // Note: getValidWorkDiv() is called inside getWorkDiv
-    std::ignore = getWorkDiv<Acc>();
-}
-
 TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -65,6 +36,10 @@ TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::t
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                // Upper bound for threads per block.
+                // Set threads per block upper bound value to zero. If zero, device properties (device hard limits) for
+                // threads per block is used as an upper bound.
+                static_cast<Idx>(0u),
                 false,
                 alpaka::GridBlockExtentSubDivRestrictions::EqualExtent)
             == WorkDiv{Vec{14, 28}, Vec{22, 22}, Vec{1, 1}});
@@ -73,6 +48,7 @@ TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::t
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                static_cast<Idx>(0u),
                 false,
                 alpaka::GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
             == WorkDiv{Vec{19, 19}, Vec{16, 32}, Vec{1, 1}});
@@ -81,6 +57,7 @@ TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::t
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                static_cast<Idx>(0u),
                 false,
                 alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)
             == WorkDiv{Vec{75, 5}, Vec{4, 128}, Vec{1, 1}});
@@ -90,14 +67,17 @@ TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::t
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                static_cast<Idx>(0u),
                 true,
                 alpaka::GridBlockExtentSubDivRestrictions::EqualExtent)
-            == WorkDiv{Vec{1, 2}, Vec{300, 300}, Vec{1, 1}});
+            //  The max-extent vectors, which are in device-properties struct, limit the values along each axis
+            == WorkDiv{Vec{1, 2}, Vec{256, 128}, Vec{1, 1}});
         CHECK(
             alpaka::subDivideGridElems(
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                static_cast<Idx>(0u),
                 true,
                 alpaka::GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
             == WorkDiv{Vec{20, 20}, Vec{15, 30}, Vec{1, 1}});
@@ -106,36 +86,163 @@ TEMPLATE_LIST_TEST_CASE("subDivideGridElems.2D.examples", "[workDiv]", alpaka::t
                 Vec{300, 600},
                 Vec{1, 1},
                 props,
+                static_cast<Idx>(0u),
                 true,
                 alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)
             == WorkDiv{Vec{75, 5}, Vec{4, 120}, Vec{1, 1}});
-    }
-}
 
-TEMPLATE_LIST_TEST_CASE("getValidWorkDiv.1D.withIdx", "[workDiv]", alpaka::test::TestAccs)
-{
-    using Acc = TestType;
-    using Idx = alpaka::Idx<Acc>;
-    using Dim = alpaka::Dim<Acc>;
-    using Vec = alpaka::Vec<Dim, Idx>;
-    if constexpr(Dim::value == 1)
-    {
-        auto const platform = alpaka::Platform<Acc>{};
-        auto const dev = alpaka::getDevByIdx(platform, 0);
-        // test that we can call getValidWorkDiv with the Idx type directly instead of a Vec
-        auto const ref = alpaka::getValidWorkDiv<Acc>(dev, Vec{256}, Vec{13});
-        CHECK(alpaka::getValidWorkDiv<Acc>(dev, Idx{256}, Idx{13}) == ref);
+        // Tests with blockThreadCountMax argument non zero! This is an argument because it is determined beforehand,
+        // and it depends not only to the device hard properties, but depends on the kernel used. If this argument is
+        // zero device hard limits is used inside subDivideGridElems.
+        Idx blockThreadCountMax = 256;
+        bool blockThreadMustDivideGridThreadExtent = true;
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{300, 600},
+                Vec{1, 1},
+                props,
+                // Give threads per block upper bound value explicitly. If zero, upper bound will be the default max
+                // value defined in the device properties value.
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)
+            == WorkDiv{Vec{150, 5}, Vec{2, 120}, Vec{1, 1}});
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{300, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
+            == WorkDiv{Vec{20, 40}, Vec{15, 15}, Vec{1, 1}});
+
+        // Test with non-zero blockThreadCountMax and false blockThreadMustDivideGridThreadExtent
+        blockThreadMustDivideGridThreadExtent = false;
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{300, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::EqualExtent)
+            == WorkDiv{Vec{19, 38}, Vec{16, 16}, Vec{1, 1}});
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{300, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)
+            == WorkDiv{Vec{150, 5}, Vec{2, 128}, Vec{1, 1}});
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{300, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
+            == WorkDiv{Vec{19, 38}, Vec{16, 16}, Vec{1, 1}});
+
+        // change the gridElemExtent argument
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{1000, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::EqualExtent)
+            == WorkDiv{Vec{63, 38}, Vec{16, 16}, Vec{1, 1}});
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{1000, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)
+            == WorkDiv{Vec{500, 5}, Vec{2, 128}, Vec{1, 1}});
+        CHECK(
+            alpaka::subDivideGridElems(
+                Vec{1000, 600},
+                Vec{1, 1},
+                props,
+                blockThreadCountMax,
+                blockThreadMustDivideGridThreadExtent,
+                alpaka::GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
+            == WorkDiv{Vec{63, 38}, Vec{16, 16}, Vec{1, 1}});
     }
 }
 
-TEMPLATE_LIST_TEST_CASE("isValidWorkDiv", "[workDiv]", alpaka::test::TestAccs)
+//! Test the constructors of WorkDivMembers using 3D extent, 3D extent with zero elements and 2D extents
+//! Test using any vector type in WorkDivMembers construction.
+TEST_CASE("WorkDivMembers", "[workDiv]")
 {
-    using Acc = TestType;
+    using Idx = std::size_t;
+    using Dim3D = alpaka::DimInt<3>;
+    using Vec3D = alpaka::Vec<Dim3D, Idx>;
+
+    auto blocksPerGrid3D = Vec3D{1u, 2u, 3u};
+    auto const threadsPerBlock3D = Vec3D{2u, 4u, 6u};
+    auto const elementsPerThread3D = Vec3D::all(static_cast<Idx>(1u));
+
+    // Arguments: {1,2,3},{2,4,6},{1,1,1}
+    auto ref3D = alpaka::WorkDivMembers<Dim3D, Idx>{blocksPerGrid3D, threadsPerBlock3D, elementsPerThread3D};
+    // Call without explicitly specifying explicit WorkDivMembers class template parameter types
+    auto workDiv3D = alpaka::WorkDivMembers(blocksPerGrid3D, threadsPerBlock3D, elementsPerThread3D);
+    CHECK(ref3D == workDiv3D);
+
+    // Change blocks per grid
+    blocksPerGrid3D = Vec3D{3u, 6u, 9u};
+    // Arguments: {3,6,9},{2,4,6},{1,1,1}
+    ref3D = alpaka::WorkDivMembers<Dim3D, Idx>{blocksPerGrid3D, threadsPerBlock3D, elementsPerThread3D};
+    // Call without explicitly specifying explicit WorkDivMembers class template parameter types
+    workDiv3D = alpaka::WorkDivMembers(blocksPerGrid3D, threadsPerBlock3D, elementsPerThread3D);
+    CHECK(ref3D == workDiv3D);
+
+    // Test using 2D vectors
+    using Dim2D = alpaka::DimInt<2>;
+    using Vec2D = alpaka::Vec<Dim2D, Idx>;
+
+    auto const blocksPerGrid2D = Vec2D{6u, 9u};
+    auto const threadsPerBlock2D = Vec2D{4u, 6u};
+    auto const elementsPerThread2D = Vec2D::all(static_cast<Idx>(1u));
+
+    // Arguments: {6,9},{4,6},{1,1}. The order of each input is y-x since alpaka vector uses z-y-x ordering
+    auto const ref2D = alpaka::WorkDivMembers<Dim2D, Idx>{blocksPerGrid2D, threadsPerBlock2D, elementsPerThread2D};
+    auto const workDiv2D = alpaka::WorkDivMembers(blocksPerGrid2D, threadsPerBlock2D, elementsPerThread2D);
+    CHECK(ref2D == workDiv2D);
+
+    // Test using initializer lists. Arguments: {6,9},{4,6},{1,1}. The order of initializer list is y-x since alpaka
+    // vector uses z-y-x ordering
+    auto const workDiv2DUsingInitList = alpaka::WorkDivMembers<Dim2D, Idx>({6, 9}, {4, 6}, {1, 1});
+    CHECK(ref2D == workDiv2DUsingInitList);
+
+    // Test using different input types with different number of dimensions(ranks), number of dimensions reduced to
+    // given explicit class template type number of dimensions (e.g. Dim2D) in the call. Arguments: {6,9},{2,4,6},{1,1}
+    auto worDiv2DUsingMixedRanks
+        = alpaka::WorkDivMembers<Dim2D, Idx>{blocksPerGrid2D, threadsPerBlock3D, elementsPerThread3D};
+    // Since the first element of threadsPerBlock3D is along Z-axis, it is removed
+    CHECK(ref2D == worDiv2DUsingMixedRanks);
+
+    worDiv2DUsingMixedRanks
+        = alpaka::WorkDivMembers<Dim2D, Idx>{blocksPerGrid2D, threadsPerBlock3D, elementsPerThread2D};
+    CHECK(ref2D == worDiv2DUsingMixedRanks);
+
+    // Test the construction by using a user-defined type FooVec
+    //
+    // Test WorkDivMembers using the arguments of the type FooVec
+    auto const blocksPerGrid2DFoo = FooVec<size_t, 2u>{9u, 6u};
+    auto const threadsPerBlock2DFoo = FooVec<size_t, 2u>{6u, 4u};
+    auto const elementsPerThread2DFoo = FooVec<size_t, 2u>{1u, 1u};
 
-    auto const platform = alpaka::Platform<Acc>{};
-    auto const dev = alpaka::getDevByIdx(platform, 0);
-    auto const workDiv = getWorkDiv<Acc>();
-    // Test both overloads
-    REQUIRE(alpaka::isValidWorkDiv(alpaka::getAccDevProps<Acc>(dev), workDiv));
-    REQUIRE(alpaka::isValidWorkDiv<Acc>(dev, workDiv));
+    // Arguments: {9,6},{6,4},{1,1}. These arguments are reversed at GetExtents specialization of FooVec
+    // FooVec assumes the list is ordered as x-y-z
+    auto const workDiv2DUsingFooVec
+        = alpaka::WorkDivMembers<Dim2D, Idx>{blocksPerGrid2DFoo, threadsPerBlock2DFoo, elementsPerThread2DFoo};
+    CHECK(ref2D == workDiv2DUsingFooVec);
 }
diff --git a/alpaka/thirdParty/CMakeLists.txt b/alpaka/thirdParty/CMakeLists.txt
index e8f03d28..97ed4d5f 100644
--- a/alpaka/thirdParty/CMakeLists.txt
+++ b/alpaka/thirdParty/CMakeLists.txt
@@ -3,9 +3,9 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-if(BUILD_TESTING)
+if(BUILD_TESTING OR alpaka_BUILD_BENCHMARKS)
     if(alpaka_USE_INTERNAL_CATCH2)
-        message(STATUS "Catch2: Using INTERNAL version 3.3.2")
+        message(STATUS "Catch2: Using INTERNAL version 3.5.2")
         # Force Catch2's CMake to pick up the variables we set below
         set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
@@ -39,7 +39,7 @@ if(BUILD_TESTING)
             endif()
         endforeach()
     else()
-        find_package(Catch2 3.3.2 CONFIG REQUIRED)
+        find_package(Catch2 3.5.2 CONFIG REQUIRED)
         message(STATUS "Catch2: Found version ${Catch2_VERSION}")
     endif()
 endif()
diff --git a/alpaka/thirdParty/catch2/.bazelrc b/alpaka/thirdParty/catch2/.bazelrc
index c01cb39f..9cb0aa1b 100644
--- a/alpaka/thirdParty/catch2/.bazelrc
+++ b/alpaka/thirdParty/catch2/.bazelrc
@@ -8,3 +8,4 @@ build:vs2022 --cxxopt=/std:c++17
 
 build:windows --config=vs2022
 build:linux --config=gcc11
+build:macos --cxxopt=-std=c++2b
diff --git a/alpaka/thirdParty/catch2/.github/workflows/linux-bazel-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/linux-bazel-builds.yml
index 9006652e..dc826ac0 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/linux-bazel-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/linux-bazel-builds.yml
@@ -11,7 +11,7 @@ jobs:
         compilation_mode: [fastbuild, dbg, opt]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Mount bazel cache
       uses: actions/cache@v3
diff --git a/alpaka/thirdParty/catch2/.github/workflows/linux-meson-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/linux-meson-builds.yml
index dec701b6..4ffa0243 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/linux-meson-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/linux-meson-builds.yml
@@ -18,10 +18,12 @@ jobs:
             other_pkgs: clang-11
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Prepare environment
-      run: sudo apt-get install -y meson ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y meson ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       env:
diff --git a/alpaka/thirdParty/catch2/.github/workflows/linux-other-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/linux-other-builds.yml
index cf4e2c06..4a7f5ecc 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/linux-other-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/linux-other-builds.yml
@@ -29,13 +29,13 @@ jobs:
             build_type: Debug
             std: 14
             other_pkgs: g++-7
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
           - cxx: g++-7
             build_description: Extras + Examples
             build_type: Release
             std: 14
             other_pkgs: g++-7
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
 
           # Extras and examples with Clang-10
           - cxx: clang++-10
@@ -43,13 +43,13 @@ jobs:
             build_type: Debug
             std: 17
             other_pkgs: clang-10
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
           - cxx: clang++-10
             build_description: Extras + Examples
             build_type: Release
             std: 17
             other_pkgs: clang-10
-            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON
+            cmake_configurations: -DCATCH_BUILD_EXTRA_TESTS=ON -DCATCH_BUILD_EXAMPLES=ON -DCATCH_ENABLE_CMAKE_HELPER_TESTS=ON
 
           # Configure tests with Clang-10
           - cxx: clang++-10
@@ -70,10 +70,12 @@ jobs:
 
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Prepare environment
-      run: sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
diff --git a/alpaka/thirdParty/catch2/.github/workflows/linux-simple-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/linux-simple-builds.yml
index 989c4942..a32eb597 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/linux-simple-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/linux-simple-builds.yml
@@ -83,7 +83,7 @@ jobs:
             other_pkgs: g++-10
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Add repositories for older GCC
       run: |
@@ -92,7 +92,9 @@ jobs:
       if: ${{ matrix.cxx == 'g++-5' || matrix.cxx == 'g++-6' }}
 
     - name: Prepare environment
-      run: sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ninja-build ${{matrix.other_pkgs}}
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
diff --git a/alpaka/thirdParty/catch2/.github/workflows/mac-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/mac-builds.yml
index 955b81fc..259d8b36 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/mac-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/mac-builds.yml
@@ -22,7 +22,7 @@ jobs:
             extra_tests: ON
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Configure build
       working-directory: ${{runner.workspace}}
@@ -42,11 +42,10 @@ jobs:
 
     - name: Build tests + lib
       working-directory: ${{runner.workspace}}/build
-      run: make -j 2
+      run: make -j `sysctl -n hw.ncpu`
 
     - name: Run tests
       env:
           CTEST_OUTPUT_ON_FAILURE: 1
       working-directory: ${{runner.workspace}}/build
-      # Hardcode 2 cores we know are there
-      run: ctest -C ${{matrix.build_type}} -j 2
+      run: ctest -C ${{matrix.build_type}} -j `sysctl -n hw.ncpu`
diff --git a/alpaka/thirdParty/catch2/.github/workflows/validate-header-guards.yml b/alpaka/thirdParty/catch2/.github/workflows/validate-header-guards.yml
index c02b5d49..fa9d1574 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/validate-header-guards.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/validate-header-guards.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
 
       - name: Checkout source code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Setup Dependencies
         uses: actions/setup-python@v2
diff --git a/alpaka/thirdParty/catch2/.github/workflows/windows-simple-builds.yml b/alpaka/thirdParty/catch2/.github/workflows/windows-simple-builds.yml
index 197fa219..5fb7b8fe 100644
--- a/alpaka/thirdParty/catch2/.github/workflows/windows-simple-builds.yml
+++ b/alpaka/thirdParty/catch2/.github/workflows/windows-simple-builds.yml
@@ -13,7 +13,7 @@ jobs:
         build_type: [Debug, Release]
         std: [14, 17]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Configure build
         working-directory: ${{runner.workspace}}
diff --git a/alpaka/thirdParty/catch2/BUILD.bazel b/alpaka/thirdParty/catch2/BUILD.bazel
index 3125e7c5..c51bf57e 100644
--- a/alpaka/thirdParty/catch2/BUILD.bazel
+++ b/alpaka/thirdParty/catch2/BUILD.bazel
@@ -43,12 +43,15 @@ expand_template(
         "#cmakedefine CATCH_CONFIG_NO_GLOBAL_NEXTAFTER": "",
         "#cmakedefine CATCH_CONFIG_NO_POSIX_SIGNALS": "",
         "#cmakedefine CATCH_CONFIG_NO_USE_ASYNC": "",
+        "#cmakedefine CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "",
         "#cmakedefine CATCH_CONFIG_NO_WCHAR": "",
         "#cmakedefine CATCH_CONFIG_NO_WINDOWS_SEH": "",
         "#cmakedefine CATCH_CONFIG_NOSTDOUT": "",
         "#cmakedefine CATCH_CONFIG_POSIX_SIGNALS": "",
         "#cmakedefine CATCH_CONFIG_PREFIX_ALL": "",
+        "#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES": "",
         "#cmakedefine CATCH_CONFIG_SHARED_LIBRARY": "",
+        "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "",
         "#cmakedefine CATCH_CONFIG_USE_ASYNC": "",
         "#cmakedefine CATCH_CONFIG_WCHAR": "",
         "#cmakedefine CATCH_CONFIG_WINDOWS_CRTDBG": "",
diff --git a/alpaka/thirdParty/catch2/CMake/CatchConfigOptions.cmake b/alpaka/thirdParty/catch2/CMake/CatchConfigOptions.cmake
index 733ec65e..6eae220d 100644
--- a/alpaka/thirdParty/catch2/CMake/CatchConfigOptions.cmake
+++ b/alpaka/thirdParty/catch2/CMake/CatchConfigOptions.cmake
@@ -18,10 +18,12 @@
 macro(AddOverridableConfigOption OptionBaseName)
   option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF)
   option(CATCH_CONFIG_NO_${OptionBaseName} "Read docs/configuration.md for details" OFF)
+  mark_as_advanced(CATCH_CONFIG_${OptionBaseName} CATCH_CONFIG_NO_${OptionBaseName})
 endmacro()
 
 macro(AddConfigOption OptionBaseName)
   option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF)
+  mark_as_advanced(CATCH_CONFIG_${OptionBaseName})
 endmacro()
 
 set(_OverridableOptions
@@ -41,6 +43,7 @@ set(_OverridableOptions
   "WCHAR"
   "WINDOWS_SEH"
   "GETENV"
+  "EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT"
 )
 
 foreach(OptionName ${_OverridableOptions})
@@ -61,6 +64,7 @@ set(_OtherConfigOptions
   "FAST_COMPILE"
   "NOSTDOUT"
   "PREFIX_ALL"
+  "PREFIX_MESSAGES"
   "WINDOWS_CRTDBG"
 )
 
@@ -68,11 +72,17 @@ set(_OtherConfigOptions
 foreach(OptionName ${_OtherConfigOptions})
   AddConfigOption(${OptionName})
 endforeach()
-set(CATCH_CONFIG_SHARED_LIBRARY ${BUILD_SHARED_LIBS})
+if(DEFINED BUILD_SHARED_LIBS)
+    set(CATCH_CONFIG_SHARED_LIBRARY ${BUILD_SHARED_LIBS})
+else()
+    set(CATCH_CONFIG_SHARED_LIBRARY "")
+endif()
 
 set(CATCH_CONFIG_DEFAULT_REPORTER "console" CACHE STRING "Read docs/configuration.md for details. The name of the reporter should be without quotes.")
 set(CATCH_CONFIG_CONSOLE_WIDTH "80" CACHE STRING "Read docs/configuration.md for details. Must form a valid integer literal.")
 
+mark_as_advanced(CATCH_CONFIG_SHARED_LIBRARY CATCH_CONFIG_DEFAULT_REPORTER CATCH_CONFIG_CONSOLE_WIDTH)
+
 # There is no good way to both turn this into a CMake cache variable,
 # and keep reasonable default semantics inside the project. Thus we do
 # not define it and users have to provide it as an outside variable.
diff --git a/alpaka/thirdParty/catch2/CMake/CatchMiscFunctions.cmake b/alpaka/thirdParty/catch2/CMake/CatchMiscFunctions.cmake
index 3758d956..84bd7cc7 100644
--- a/alpaka/thirdParty/catch2/CMake/CatchMiscFunctions.cmake
+++ b/alpaka/thirdParty/catch2/CMake/CatchMiscFunctions.cmake
@@ -46,7 +46,6 @@ function(add_warnings_to_targets targets)
         set(CHECKED_WARNING_FLAGS
           "-Wabsolute-value"
           "-Wall"
-          "-Wc++20-compat"
           "-Wcall-to-pure-virtual-from-ctor-dtor"
           "-Wcast-align"
           "-Wcatch-value"
@@ -74,16 +73,18 @@ function(add_warnings_to_targets targets)
           "-Woverloaded-virtual"
           "-Wparentheses"
           "-Wpedantic"
+          "-Wredundant-decls"
           "-Wreorder"
           "-Wreturn-std-move"
           "-Wshadow"
           "-Wstrict-aliasing"
+          "-Wsubobject-linkage"
           "-Wsuggest-destructor-override"
           "-Wsuggest-override"
           "-Wundef"
           "-Wuninitialized"
           "-Wunneeded-internal-declaration"
-          "-Wunreachable-code"
+          "-Wunreachable-code-aggressive"
           "-Wunused"
           "-Wunused-function"
           "-Wunused-parameter"
diff --git a/alpaka/thirdParty/catch2/CMakeLists.txt b/alpaka/thirdParty/catch2/CMakeLists.txt
index 6d381d8d..78ac4c8a 100644
--- a/alpaka/thirdParty/catch2/CMakeLists.txt
+++ b/alpaka/thirdParty/catch2/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 option(CATCH_INSTALL_DOCS "Install documentation alongside library" ON)
 option(CATCH_INSTALL_EXTRAS "Install extras (CMake scripts, debugger helpers) alongside library" ON)
 option(CATCH_DEVELOPMENT_BUILD "Build tests, enable warnings, enable Werror, etc" OFF)
+option(CATCH_ENABLE_REPRODUCIBLE_BUILD "Add compiler flags for improving build reproducibility" ON)
 
 include(CMakeDependentOption)
 cmake_dependent_option(CATCH_BUILD_TESTING "Build the SelfTest project" ON "CATCH_DEVELOPMENT_BUILD" OFF)
@@ -21,6 +22,7 @@ cmake_dependent_option(CATCH_ENABLE_COVERAGE "Generate coverage for codecov.io"
 cmake_dependent_option(CATCH_ENABLE_WERROR "Enables Werror during build" ON "CATCH_DEVELOPMENT_BUILD" OFF)
 cmake_dependent_option(CATCH_BUILD_SURROGATES "Enable generating and building surrogate TUs for the main headers" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
 cmake_dependent_option(CATCH_ENABLE_CONFIGURE_TESTS "Enable CMake configuration tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
+cmake_dependent_option(CATCH_ENABLE_CMAKE_HELPER_TESTS "Enable CMake helper tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF)
 
 
 # Catch2's build breaks if done in-tree. You probably should not build
@@ -31,7 +33,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
 endif()
 
 project(Catch2
-  VERSION 3.3.2 # CML version placeholder, don't delete
+  VERSION 3.5.2 # CML version placeholder, don't delete
   LANGUAGES CXX
   # HOMEPAGE_URL is not supported until CMake version 3.12, which
   # we do not target yet.
@@ -148,6 +150,8 @@ if (NOT_SUBPROJECT)
             "extras/ParseAndAddCatchTests.cmake"
             "extras/Catch.cmake"
             "extras/CatchAddTests.cmake"
+            "extras/CatchShardTests.cmake"
+            "extras/CatchShardTestsImpl.cmake"
           DESTINATION
             ${CATCH_CMAKE_CONFIG_DESTINATION}
         )
diff --git a/alpaka/thirdParty/catch2/CMakePresets.json b/alpaka/thirdParty/catch2/CMakePresets.json
index 00f3a6d3..88541285 100644
--- a/alpaka/thirdParty/catch2/CMakePresets.json
+++ b/alpaka/thirdParty/catch2/CMakePresets.json
@@ -18,7 +18,8 @@
                 "CATCH_BUILD_EXAMPLES": "ON",
                 "CATCH_BUILD_EXTRA_TESTS": "ON",
                 "CATCH_BUILD_SURROGATES": "ON",
-                "CATCH_ENABLE_CONFIGURE_TESTS": "ON"
+                "CATCH_ENABLE_CONFIGURE_TESTS": "ON",
+                "CATCH_ENABLE_CMAKE_HELPER_TESTS": "ON"
             }
         }
     ]   
diff --git a/alpaka/thirdParty/catch2/Doxyfile b/alpaka/thirdParty/catch2/Doxyfile
index 07b385ec..914e5984 100644
--- a/alpaka/thirdParty/catch2/Doxyfile
+++ b/alpaka/thirdParty/catch2/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.16
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "Catch2"
+PROJECT_NAME           = Catch2
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -51,6 +51,7 @@ PROJECT_BRIEF          = "Popular C++ unit testing framework"
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
+PROJECT_LOGO           =
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -216,6 +217,14 @@ QT_AUTOBRIEF           = YES
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -251,13 +260,7 @@ TAB_SIZE               = 4
 # a double escape (\\{ and \\})
 
 ALIASES                = "complexity=@par Complexity:" \
-                         "noexcept=**Noexcept**"
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
+                         noexcept=**Noexcept**
 
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
@@ -299,19 +302,22 @@ OPTIMIZE_OUTPUT_SLICE  = NO
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
 # Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
 # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
 # tries to guess whether the code is fixed or free formatted code, this is the
-# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
-# .inc files as Fortran files (default is PHP), and .f files as C (default is
-# Fortran), use: inc=Fortran f=C.
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
@@ -445,6 +451,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -508,6 +527,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -525,8 +551,8 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -545,11 +571,18 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# (including Cygwin) ands Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = NO
@@ -788,7 +821,10 @@ WARN_IF_DOC_ERROR      = YES
 WARN_NO_PARAMDOC       = YES
 
 # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
 # The default value is: NO.
 
 WARN_AS_ERROR          = NO
@@ -819,13 +855,13 @@ WARN_LOGFILE           = doxygen.errors
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT  = "src/catch2"
+INPUT                  = src/catch2
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
@@ -838,13 +874,61 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
-
-# FILE_PATTERNS          =
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f18 \
+                         *.f \
+                         *.for \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.ice
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
 # be searched for input files as well.
@@ -968,6 +1052,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
+USE_MDFILE_AS_MAINPAGE =
 
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
@@ -1055,6 +1140,44 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to
+# YES then doxygen will add the directory of each input to the include path.
+# The default value is: YES.
+
+CLANG_ADD_INC_PATHS    = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1066,13 +1189,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1211,9 +1327,9 @@ HTML_TIMESTAMP         = NO
 
 # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
 # documentation will contain a main index with vertical navigation menus that
-# are dynamically created via Javascript. If disabled, the navigation index will
+# are dynamically created via JavaScript. If disabled, the navigation index will
 # consists of multiple levels of tabs that are statically embedded in every HTML
-# page. Disable this option to support browsers that do not have Javascript,
+# page. Disable this option to support browsers that do not have JavaScript,
 # like the Qt help browser.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1243,10 +1359,11 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: https://developer.apple.com/xcode/), introduced with OSX
-# 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
 # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
 # genXcode/_index.html for more information.
@@ -1288,8 +1405,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1364,7 +1481,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1372,8 +1490,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1381,16 +1499,16 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
@@ -1402,9 +1520,9 @@ QHP_CUST_FILTER_ATTRS  =
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1481,6 +1599,17 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1501,8 +1630,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# https://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1514,7 +1649,7 @@ USE_MATHJAX            = YES
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1530,7 +1665,7 @@ MATHJAX_FORMAT         = HTML-CSS
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
 # MathJax from https://www.mathjax.org before deployment.
-# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
@@ -1545,7 +1680,8 @@ MATHJAX_EXTENSIONS     = TeX/AMSmath \
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1573,7 +1709,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1592,7 +1728,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1605,8 +1742,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: https://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1770,9 +1908,11 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -2204,7 +2344,7 @@ HIDE_UNDOC_RELATIONS   = YES
 # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
 # Bell Labs. The other options in this section have no effect if this option is
 # set to NO
-# The default value is: NO.
+# The default value is: YES.
 
 HAVE_DOT               = YES
 
@@ -2283,10 +2423,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2360,7 +2522,9 @@ DIRECTORY_GRAPH        = NO
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
 # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
 # png:gdiplus:gdiplus.
 # The default value is: png.
@@ -2476,9 +2640,11 @@ DOT_MULTI_TARGETS      = YES
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/alpaka/thirdParty/catch2/WORKSPACE.bazel b/alpaka/thirdParty/catch2/WORKSPACE.bazel
index 6fd2ffa5..357e6f94 100644
--- a/alpaka/thirdParty/catch2/WORKSPACE.bazel
+++ b/alpaka/thirdParty/catch2/WORKSPACE.bazel
@@ -4,12 +4,13 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 http_archive(
     name = "bazel_skylib",
+    sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
-        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
     ],
-    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
 )
 
 load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+
 bazel_skylib_workspace()
diff --git a/alpaka/thirdParty/catch2/appveyor.yml b/alpaka/thirdParty/catch2/appveyor.yml
index 3b6580d8..7a0ad83f 100644
--- a/alpaka/thirdParty/catch2/appveyor.yml
+++ b/alpaka/thirdParty/catch2/appveyor.yml
@@ -70,14 +70,3 @@ environment:
       additional_flags: "/permissive- /std:c++latest"
       platform: x64
       configuration: Debug
-
-    - FLAVOR: VS 2017 x64 Debug
-      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      platform: x64
-      configuration: Debug
-
-    - FLAVOR: VS 2017 x64 Release Coverage
-      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
-      coverage: 1
-      platform: x64
-      configuration: Debug
\ No newline at end of file
diff --git a/alpaka/thirdParty/catch2/docs/benchmarks.md b/alpaka/thirdParty/catch2/docs/benchmarks.md
index 548913c7..9edbb93c 100644
--- a/alpaka/thirdParty/catch2/docs/benchmarks.md
+++ b/alpaka/thirdParty/catch2/docs/benchmarks.md
@@ -93,7 +93,7 @@ Fibonacci
 -------------------------------------------------------------------------------
 C:\path\to\Catch2\Benchmark.tests.cpp(10)
 ...............................................................................
-benchmark name                                  samples       iterations    estimated
+benchmark name                                  samples       iterations    est run time
                                                 mean          low mean      high mean
                                                 std dev       low std dev   high std dev
 -------------------------------------------------------------------------------
diff --git a/alpaka/thirdParty/catch2/docs/ci-and-misc.md b/alpaka/thirdParty/catch2/docs/ci-and-misc.md
index c07da29f..49bbd989 100644
--- a/alpaka/thirdParty/catch2/docs/ci-and-misc.md
+++ b/alpaka/thirdParty/catch2/docs/ci-and-misc.md
@@ -82,7 +82,7 @@ variable set to "1".
 
 ### CodeCoverage module (GCOV, LCOV...)
 
-If you are using GCOV tool to get testing coverage of your code, and are not sure how to integrate it with CMake and Catch, there should be an external example over at https://github.com/fkromer/catch_cmake_coverage
+If you are using GCOV tool to get testing coverage of your code, and are not sure how to integrate it with CMake and Catch, there should be an external example over at https://github.com/claremacrae/catch_cmake_coverage
 
 
 ### pkg-config
diff --git a/alpaka/thirdParty/catch2/docs/cmake-integration.md b/alpaka/thirdParty/catch2/docs/cmake-integration.md
index 0720a95b..86666efe 100644
--- a/alpaka/thirdParty/catch2/docs/cmake-integration.md
+++ b/alpaka/thirdParty/catch2/docs/cmake-integration.md
@@ -51,7 +51,7 @@ Include(FetchContent)
 FetchContent_Declare(
   Catch2
   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.0.1 # or a later release
+  GIT_TAG        v3.4.0 # or a later release
 )
 
 FetchContent_MakeAvailable(Catch2)
@@ -126,6 +126,7 @@ catch_discover_tests(target
                      [OUTPUT_DIR dir]
                      [OUTPUT_PREFIX prefix]
                      [OUTPUT_SUFFIX suffix]
+                     [DISCOVERY_MODE <POST_BUILD|PRE_TEST>]
 )
 ```
 
@@ -198,6 +199,16 @@ If specified, `suffix` is added to each output file name, like so
 `--out dir/<test_name>suffix`. This can be used to add a file extension to
 the output file name e.g. ".xml".
 
+* `DISCOVERY_MODE mode`
+
+If specified allows control over when test discovery is performed.
+For a value of `POST_BUILD` (default) test discovery is performed at build time.
+For a value of `PRE_TEST` test discovery is delayed until just prior to test
+execution (useful e.g. in cross-compilation environments).
+``DISCOVERY_MODE`` defaults to the value of the
+``CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE`` variable if it is not passed when
+calling ``catch_discover_tests``. This provides a mechanism for globally
+selecting a preferred test discovery behavior.
 
 ### `ParseAndAddCatchTests.cmake`
 
diff --git a/alpaka/thirdParty/catch2/docs/command-line.md b/alpaka/thirdParty/catch2/docs/command-line.md
index a15a2131..bb483959 100644
--- a/alpaka/thirdParty/catch2/docs/command-line.md
+++ b/alpaka/thirdParty/catch2/docs/command-line.md
@@ -85,43 +85,102 @@ Click one of the following links to take you straight to that option - or scroll
 
 <pre>&lt;test-spec> ...</pre>
 
-Test cases, wildcarded test cases, tags and tag expressions are all passed directly as arguments. Tags are distinguished by being enclosed in square brackets.
+By providing a test spec, you filter which tests will be run. If you call
+Catch2 without any test spec, then it will run all non-hidden test
+cases. A test case is hidden if it has the `[!benchmark]` tag, any tag
+with a dot at the start, e.g. `[.]` or `[.foo]`.
 
-If no test specs are supplied then all test cases, except "hidden" tests, are run.
-A test is hidden by giving it any tag starting with (or just) a period (```.```) - or, in the deprecated case, tagged ```[hide]``` or given name starting with `'./'`. To specify hidden tests from the command line ```[.]``` or ```[hide]``` can be used *regardless of how they were declared*.
+There are three basic test specs that can then be combined into more
+complex specs:
 
-Specs must be enclosed in quotes if they contain spaces. If they do not contain spaces the quotes are optional.
+  * Full test name, e.g. `"Test 1"`.
 
-Wildcards consist of the `*` character at the beginning and/or end of test case names and can substitute for any number of any characters (including none).
+    This allows only test cases whose name is "Test 1".
 
-Test specs are case insensitive.
+  * Wildcarded test name, e.g. `"*Test"`, or `"Test*"`, or `"*Test*"`.
 
-If a spec is prefixed with `exclude:` or the `~` character then the pattern matches an exclusion. This means that tests matching the pattern are excluded from the set - even if a prior inclusion spec included them. Subsequent inclusion specs will take precedence, however.
-Inclusions and exclusions are evaluated in left-to-right order.
+    This allows any test case whose name ends with, starts with, or contains
+    in the middle the string "Test". Note that the wildcard can only be at
+    the start or end.
 
-Test case examples:
+  * Tag name, e.g. `[some-tag]`.
 
+    This allows any test case tagged with "[some-tag]". Remember that some
+    tags are special, e.g. those that start with "." or with "!".
+
+
+You can also combine the basic test specs to create more complex test
+specs. You can:
+
+  * Concatenate specs to apply all of them, e.g. `[some-tag][other-tag]`.
+
+    This allows test cases that are tagged with **both** "[some-tag]" **and**
+    "[other-tag]". A test case with just "[some-tag]" will not pass the filter,
+    nor will test case with just "[other-tag]".
+
+  * Comma-join specs to apply any of them, e.g. `[some-tag],[other-tag]`.
+
+    This allows test cases that are tagged with **either** "[some-tag]" **or**
+    "[other-tag]". A test case with both will obviously also pass the filter.
+
+    Note that commas take precendence over simple concatenation. This means
+    that `[a][b],[c]` accepts tests that are tagged with either both "[a]" and
+    "[b]", or tests that are tagged with just "[c]".
+
+  * Negate the spec by prepending it with `~`, e.g. `~[some-tag]`.
+
+    This rejects any test case that is tagged with "[some-tag]". Note that
+    rejection takes precedence over other filters.
+
+    Note that negations always binds to the following _basic_ test spec.
+    This means that `~[foo][bar]` negates only the "[foo]" tag and not the
+    "[bar]" tag.
+
+Note that when Catch2 is deciding whether to include a test, first it
+checks whether the test matches any negative filters. If it does,
+the test is rejected. After that, the behaviour depends on whether there
+are positive filters as well. If there are no positive filters, all
+remaining non-hidden tests are included. If there are positive filters,
+only tests that match the positive filters are included.
+
+You can also match test names with special characters by escaping them
+with a backslash (`"\"`), e.g. a test named `"Do A, then B"` is matched
+by "Do A\, then B" test spec. Backslash also escapes itself.
+
+
+### Examples
+
+Given these TEST_CASEs,
 ```
-thisTestOnly            Matches the test case called, 'thisTestOnly'
-"this test only"        Matches the test case called, 'this test only'
-these*                  Matches all cases starting with 'these'
-exclude:notThis         Matches all tests except, 'notThis'
-~notThis                Matches all tests except, 'notThis'
-~*private*              Matches all tests except those that contain 'private'
-a* ~ab* abc             Matches all tests that start with 'a', except those that
-                        start with 'ab', except 'abc', which is included
-~[tag1]                 Matches all tests except those tagged with '[tag1]'
--# [#somefile]          Matches all tests from the file 'somefile.cpp'
+TEST_CASE("Test 1") {}
+
+TEST_CASE("Test 2", "[.foo]") {}
+
+TEST_CASE("Test 3", "[.bar]") {}
+
+TEST_CASE("Test 4", "[.][foo][bar]") {}
 ```
 
-Names within square brackets are interpreted as tags.
-A series of tags form an AND expression whereas a comma-separated sequence forms an OR expression. e.g.:
+this is the result of these filters
+```
+./tests                      # Selects only the first test, others are hidden
+./tests "Test 1"             # Selects only the first test, other do not match
+./tests ~"Test 1"            # Selects no tests. Test 1 is rejected, other tests are hidden
+./tests "Test *"             # Selects all tests.
+./tests [bar]                # Selects tests 3 and 4. Other tests are not tagged [bar]
+./tests ~[foo]               # Selects test 1, because it is the only non-hidden test without [foo] tag
+./tests [foo][bar]           # Selects test 4.
+./tests [foo],[bar]          # Selects tests 2, 3, 4.
+./tests ~[foo][bar]          # Selects test 3. 2 and 4 are rejected due to having [foo] tag
+./tests ~"Test 2"[foo]       # Selects test 4, because test 2 is explicitly rejected
+./tests [foo][bar],"Test 1"  # Selects tests 1 and 4.
+./tests "Test 1*"            # Selects test 1, wildcard can match zero characters
+```
 
-<pre>[one][two],[three]</pre>
-This matches all tests tagged `[one]` and `[two]`, as well as all tests tagged `[three]`
+_Note: Using plain asterisk on a command line can cause issues with shell
+expansion. Make sure that the asterisk is passed to Catch2 and is not
+interpreted by the shell._
 
-Test names containing special characters, such as `,` or `[` can specify them on the command line using `\`.
-`\` also escapes itself.
 
 <a id="choosing-a-reporter-to-use"></a>
 ## Choosing a reporter to use
diff --git a/alpaka/thirdParty/catch2/docs/configuration.md b/alpaka/thirdParty/catch2/docs/configuration.md
index d4421f3c..8a3ddfab 100644
--- a/alpaka/thirdParty/catch2/docs/configuration.md
+++ b/alpaka/thirdParty/catch2/docs/configuration.md
@@ -15,6 +15,7 @@
 [Enabling stringification](#enabling-stringification)<br>
 [Disabling exceptions](#disabling-exceptions)<br>
 [Overriding Catch's debug break (`-b`)](#overriding-catchs-debug-break--b)<br>
+[Static analysis support](#static-analysis-support)<br>
 
 Catch2 is designed to "just work" as much as possible, and most of the
 configuration options below are changed automatically during compilation,
@@ -25,7 +26,8 @@ with the same name.
 
 ## Prefixing Catch macros
 
-    CATCH_CONFIG_PREFIX_ALL
+    CATCH_CONFIG_PREFIX_ALL       // Prefix all macros with CATCH_
+    CATCH_CONFIG_PREFIX_MESSAGES  // Prefix only INFO, UNSCOPED_INFO, WARN and CAPTURE
 
 To keep test code clean and uncluttered Catch uses short macro names (e.g. ```TEST_CASE``` and ```REQUIRE```). Occasionally these may conflict with identifiers from platform headers or the system under test. In this case the above identifier can be defined. This will cause all the Catch user macros to be prefixed with ```CATCH_``` (e.g. ```CATCH_TEST_CASE``` and ```CATCH_REQUIRE```).
 
@@ -264,6 +266,31 @@ The macro will be used as is, that is, `CATCH_BREAK_INTO_DEBUGGER();`
 must compile and must break into debugger.
 
 
+## Static analysis support
+
+> Introduced in Catch2 3.4.0.
+
+Some parts of Catch2, e.g. `SECTION`s, can be hard for static analysis
+tools to reason about. Catch2 can change its internals to help static
+analysis tools reason about the tests.
+
+Catch2 automatically detects some static analysis tools (initial
+implementation checks for clang-tidy and Coverity), but you can override
+its detection (in either direction) via
+
+```
+CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT     // force enables static analysis help
+CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT  // force disables static analysis help
+```
+
+_As the name suggests, this is currently experimental, and thus we provide
+no backwards compatibility guarantees._
+
+**DO NOT ENABLE THIS FOR BUILDS YOU INTEND TO RUN.** The changed internals
+are not meant to be runnable, only "scannable".
+
+
+
 ---
 
 [Home](Readme.md#top)
diff --git a/alpaka/thirdParty/catch2/docs/faq.md b/alpaka/thirdParty/catch2/docs/faq.md
index 0f303ee5..80923d26 100644
--- a/alpaka/thirdParty/catch2/docs/faq.md
+++ b/alpaka/thirdParty/catch2/docs/faq.md
@@ -10,6 +10,7 @@
 [Does Catch2 support running tests in parallel?](#does-catch2-support-running-tests-in-parallel)<br>
 [Can I compile Catch2 into a dynamic library?](#can-i-compile-catch2-into-a-dynamic-library)<br>
 [What repeatability guarantees does Catch2 provide?](#what-repeatability-guarantees-does-catch2-provide)<br>
+[My build cannot find `catch2/catch_user_config.hpp`, how can I fix it?](#my-build-cannot-find-catch2catch_user_confighpp-how-can-i-fix-it)<br>
 
 
 ## How do I run global setup/teardown only if tests will be run?
@@ -28,7 +29,7 @@ depending on how often the cleanup needs to happen.
 ## Why cannot I derive from the built-in reporters?
 
 They are not made to be overridden, in that we do not attempt to maintain
-a consistent internal state if a member function is overriden, and by
+a consistent internal state if a member function is overridden, and by
 forbidding users from using them as a base class, we can refactor them
 as needed later.
 
@@ -83,12 +84,30 @@ and it is also generally repeatable across versions, but we might break
 it from time to time. E.g. we broke repeatability with previous versions
 in v2.13.4 so that test cases with similar names are shuffled better.
 
-Random generators currently rely on platform's stdlib, specifically
-the distributions from `<random>`. We thus provide no extra guarantee
-above what your platform does. **Important: `<random>`'s distributions
+Since Catch2 3.5.0 the random generators use custom distributions,
+that should be repeatable across different platforms, with few caveats.
+For details see the section on random generators in the [Generator
+documentation](generators.md#random-number-generators-details).
+
+Before this version, random generators relied on distributions from
+platform's stdlib. We thus can provide no extra guarantee on top of the
+ones given by your platform. **Important: `<random>`'s distributions
 are not specified to be repeatable across different platforms.**
 
 
+## My build cannot find `catch2/catch_user_config.hpp`, how can I fix it?
+
+`catch2/catch_user_config.hpp` is a generated header that contains user
+compile time configuration. It is generated by CMake/Meson/Bazel during
+build. If you are not using either of these, your three options are to
+
+1) Build Catch2 separately using build tool that will generate the header
+2) Use the amalgamated files to build Catch2
+3) Use CMake to configure a build. This will generate the header and you
+   can copy it into your own checkout of Catch2.
+
+
+
 ---
 
 [Home](Readme.md#top)
diff --git a/alpaka/thirdParty/catch2/docs/generators.md b/alpaka/thirdParty/catch2/docs/generators.md
index 69d1a02d..8bca54c7 100644
--- a/alpaka/thirdParty/catch2/docs/generators.md
+++ b/alpaka/thirdParty/catch2/docs/generators.md
@@ -134,7 +134,7 @@ type, making their usage much nicer. These are
 * `map<T>(func, GeneratorWrapper<U>&&)` for `MapGenerator<T, U, Func>` (map `U` to `T`)
 * `chunk(chunk-size, GeneratorWrapper<T>&&)` for `ChunkGenerator<T>`
 * `random(IntegerOrFloat a, IntegerOrFloat b)` for `RandomIntegerGenerator` or `RandomFloatGenerator`
-* `range(Arithemtic start, Arithmetic end)` for `RangeGenerator<Arithmetic>` with a step size of `1`
+* `range(Arithmetic start, Arithmetic end)` for `RangeGenerator<Arithmetic>` with a step size of `1`
 * `range(Arithmetic start, Arithmetic end, Arithmetic step)` for `RangeGenerator<Arithmetic>` with a custom step size
 * `from_range(InputIterator from, InputIterator to)` for `IteratorGenerator<T>`
 * `from_range(Container const&)` for `IteratorGenerator<T>`
@@ -189,6 +189,31 @@ TEST_CASE("type conversion", "[generators]") {
 }
 ```
 
+
+### Random number generators: details
+
+> This section applies from Catch2 3.5.0. Before that, random generators
+> were a thin wrapper around distributions from `<random>`.
+
+All of the `random(a, b)` generators in Catch2 currently generate uniformly
+distributed number in closed interval \[a; b\]. This  is different from
+`std::uniform_real_distribution`, which should return numbers in interval
+\[a; b) (but due to rounding can end up returning b anyway), but the
+difference is intentional, so that `random(a, a)` makes sense. If there is
+enough interest from users, we can provide API to pick any of CC, CO, OC,
+or OO ranges.
+
+Unlike `std::uniform_int_distribution`, Catch2's generators also support
+various single-byte integral types, such as `char` or `bool`.
+
+Given the same seed, the output from the integral generators is
+reproducible across different platforms. For floating point generators,
+we only promise reproducibility on platforms that obey the IEEE 754
+standard, and where `float` is 4 bytes and `double` is 8 bytes. We provide
+no guarantees for `long double`, as the internals of `long double` can
+vary wildly across different platforms.
+
+
 ## Generator interface
 
 You can also implement your own generators, by deriving from the
@@ -221,3 +246,21 @@ For full example of implementing your own generator, look into Catch2's
 examples, specifically
 [Generators: Create your own generator](../examples/300-Gen-OwnGenerator.cpp).
 
+
+### Handling empty generators
+
+The generator interface assumes that a generator always has at least one
+element. This is not always true, e.g. if the generator depends on an external
+datafile, the file might be missing.
+
+There are two ways to handle this, depending on whether you want this
+to be an error or not.
+
+ * If empty generator **is** an error, throw an exception in constructor.
+ * If empty generator **is not** an error, use the [`SKIP`](skipping-passing-failing.md#skipping-test-cases-at-runtime) in constructor.
+
+
+
+---
+
+[Home](Readme.md#top)
diff --git a/alpaka/thirdParty/catch2/docs/limitations.md b/alpaka/thirdParty/catch2/docs/limitations.md
index cc0ed05d..099dd82a 100644
--- a/alpaka/thirdParty/catch2/docs/limitations.md
+++ b/alpaka/thirdParty/catch2/docs/limitations.md
@@ -173,13 +173,3 @@ TEST_CASE("b") {
 
 If you are seeing a problem like this, i.e. weird test paths that trigger only under Clang with `libc++`, or only under very specific version of `libstdc++`, it is very likely you are seeing this. The only known workaround is to use a fixed version of your standard library.
 
-
-### libstdc++, `_GLIBCXX_DEBUG` macro and random ordering of tests
-
-Running a Catch2 binary compiled against libstdc++ with `_GLIBCXX_DEBUG`
-macro defined with `--order rand` will cause a debug check to trigger and
-abort the run due to self-assignment.
-[This is a known bug inside libstdc++](https://stackoverflow.com/questions/22915325/avoiding-self-assignment-in-stdshuffle/23691322)
-
-Workaround: Don't use `--order rand` when compiling against debug-enabled
-libstdc++.
diff --git a/alpaka/thirdParty/catch2/docs/matchers.md b/alpaka/thirdParty/catch2/docs/matchers.md
index 14c15898..d5be1f5a 100644
--- a/alpaka/thirdParty/catch2/docs/matchers.md
+++ b/alpaka/thirdParty/catch2/docs/matchers.md
@@ -50,25 +50,43 @@ Both of the string matchers used in the examples above live in the
 `catch_matchers_string.hpp` header, so to compile the code above also
 requires `#include <catch2/matchers/catch_matchers_string.hpp>`.
 
+### Combining operators and lifetimes
+
 **IMPORTANT**: The combining operators do not take ownership of the
-matcher objects being combined. This means that if you store combined
-matcher object, you have to ensure that the matchers being combined
-outlive its last use. What this means is that the following code leads
-to a use-after-free (UAF):
+matcher objects being combined.
+
+This means that if you store combined matcher object, you have to ensure
+that the individual matchers being combined outlive the combined matcher.
+Note that the negation matcher from `!` also counts as combining matcher
+for this.
 
+Explained on an example, this is fine
 ```cpp
-#include <catch2/catch_test_macros.hpp>
-#include <catch2/matchers/catch_matchers_string.hpp>
+CHECK_THAT(value, WithinAbs(0, 2e-2) && !WithinULP(0., 1));
+```
 
-TEST_CASE("Bugs, bugs, bugs", "[Bug]"){
-    std::string str = "Bugs as a service";
+and so is this
+```cpp
+auto is_close_to_zero = WithinAbs(0, 2e-2);
+auto is_zero          = WithinULP(0., 1);
 
-    auto match_expression = Catch::Matchers::EndsWith( "as a service" ) ||
-        (Catch::Matchers::StartsWith( "Big data" ) && !Catch::Matchers::ContainsSubstring( "web scale" ) );
-    REQUIRE_THAT(str, match_expression);
-}
+CHECK_THAT(value, is_close_to_zero && !is_zero);
 ```
 
+but this is not
+```cpp
+auto is_close_to_zero = WithinAbs(0, 2e-2);
+auto is_zero          = WithinULP(0., 1);
+auto is_close_to_but_not_zero = is_close_to_zero && !is_zero;
+
+CHECK_THAT(a_value, is_close_to_but_not_zero); // UAF
+```
+
+because `!is_zero` creates a temporary instance of Negation matcher,
+which the `is_close_to_but_not_zero` refers to. After the line ends,
+the temporary is destroyed and the combined `is_close_to_but_not_zero`
+matcher now refers to non-existent object, so using it causes use-after-free.
+
 
 ## Built-in matchers
 
@@ -286,7 +304,7 @@ comparable. (e.g. you may compare `std::vector<int>` to `std::array<char>`).
 `UnorderedRangeEquals` is similar to `RangeEquals`, but the order
 does not matter. For example "1, 2, 3" would match "3, 2, 1", but not
 "1, 1, 2, 3" As with `RangeEquals`, `UnorderedRangeEquals` compares
-the individual elements using using `operator==` by default.
+the individual elements using `operator==` by default.
 
 Both `RangeEquals` and `UnorderedRangeEquals` optionally accept a
 predicate which can be used to compare the containers element-wise.
diff --git a/alpaka/thirdParty/catch2/docs/opensource-users.md b/alpaka/thirdParty/catch2/docs/opensource-users.md
index 12b4551c..a02d0b98 100644
--- a/alpaka/thirdParty/catch2/docs/opensource-users.md
+++ b/alpaka/thirdParty/catch2/docs/opensource-users.md
@@ -95,6 +95,9 @@ A C++ client library for Consul. Consul is a distributed tool for discovering an
 ### [Reactive-Extensions/ RxCpp](https://github.com/Reactive-Extensions/RxCpp)
 A library of algorithms for values-distributed-in-time.
 
+### [SFML](https://github.com/SFML/SFML)
+Simple and Fast Multimedia Library.
+
 ### [SOCI](https://github.com/SOCI/soci)
 The C++ Database Access Library.
 
@@ -110,6 +113,12 @@ A header-only TOML parser and serializer for modern C++.
 ### [Trompeloeil](https://github.com/rollbear/trompeloeil)
 A thread-safe header-only mocking framework for C++14.
 
+### [wxWidgets](https://www.wxwidgets.org/)
+Cross-Platform C++ GUI Library.
+
+### [xmlwrapp](https://github.com/vslavik/xmlwrapp)
+C++ XML parsing library using libxml2.
+
 ## Applications & Tools
 
 ### [App Mesh](https://github.com/laoshanxi/app-mesh)
@@ -137,7 +146,7 @@ Newsbeuter is an open-source RSS/Atom feed reader for text terminals.
 A 2D, Zombie, RPG game which is being made on our own engine.
 
 ### [raspigcd](https://github.com/pantadeusz/raspigcd)
-Low level CLI app and library for execution of GCODE on Raspberry Pi without any additional microcontrolers (just RPi + Stepsticks).
+Low level CLI app and library for execution of GCODE on Raspberry Pi without any additional microcontrollers (just RPi + Stepsticks).
 
 ### [SpECTRE](https://github.com/sxs-collaboration/spectre)
 SpECTRE is a code for multi-scale, multi-physics problems in astrophysics and gravitational physics.
diff --git a/alpaka/thirdParty/catch2/docs/release-notes.md b/alpaka/thirdParty/catch2/docs/release-notes.md
index 1fa37da4..ca7f4dde 100644
--- a/alpaka/thirdParty/catch2/docs/release-notes.md
+++ b/alpaka/thirdParty/catch2/docs/release-notes.md
@@ -2,6 +2,10 @@
 
 # Release notes
 **Contents**<br>
+[3.5.2](#352)<br>
+[3.5.1](#351)<br>
+[3.5.0](#350)<br>
+[3.4.0](#340)<br>
 [3.3.2](#332)<br>
 [3.3.1](#331)<br>
 [3.3.0](#330)<br>
@@ -56,6 +60,87 @@
 [Even Older versions](#even-older-versions)<br>
 
 
+## 3.5.1
+
+### Fixes
+* Fixed `-Wsubobject-linkage` in the Console reporter (#2794)
+* Fixed adding new CLI Options to lvalue parser using `|` (#2787)
+
+
+## 3.5.1
+
+### Improvements
+* Significantly improved performance of the CLI parsing.
+  * This includes the cost of preparing the CLI parser, so Catch2's binaries start much faster.
+
+### Miscellaneous
+* Added support for Bazel modules (#2781)
+* Added CMake option to disable the build reproducibility settings (#2785)
+* Added `log` library linking to the Meson build (#2784)
+
+
+## 3.5.0
+
+### Improvements
+* Introduced `CATCH_CONFIG_PREFIX_MESSAGES` to prefix only logging macros (#2544)
+  * This means `INFO`, `UNSCOPED_INFO`, `WARN` and `CAPTURE`.
+* Section hints in static analysis mode are now `const`
+  * This prevents Clang-Tidy from complaining about `misc-const-correctness`.
+* `from_range` generator supports C arrays and ranges that require ADL (#2737)
+* Stringification support for `std::optional` now also includes `std::nullopt` (#2740)
+* The Console reporter flushes output after writing benchmark runtime estimate.
+  * This means that you can immediately see for how long the benchmark is expected to run.
+* Added workaround to enable compilation with ICC 19.1 (#2551, #2766)
+* Compiling Catch2 for XBox should work out of the box (#2772)
+  * Catch2 should automatically disable getenv when compiled for XBox.
+* Compiling Catch2 with exceptions disabled no longer triggers `Wunused-function` (#2726)
+* **`random` Generators for integral types are now reproducible across different platforms**
+  * Unlike `<random>`, Catch2's generators also support 1 byte integral types (`char`, `bool`, ...)
+* **`random` Generators for `float` and `double` are now reproducible across different platforms**
+  * `long double` varies across different platforms too much to be reproducible
+  * This guarantee applies only to platforms with IEEE 754 floats.
+
+### Fixes
+* UDL declaration inside Catch2 are now strictly conforming to the standard
+  * `operator "" _a` is UB, `operator ""_a` is fine. Seriously.
+* Fixed `CAPTURE` tests failing to compile in C++23 mode (#2744)
+* Fixed missing include in `catch_message.hpp` (#2758)
+* Fixed `CHECK_ELSE` suppressing failure from uncaught exceptions(#2723)
+
+### Miscellaneous
+* The documentation for specifying which tests to run through commandline has been completely rewritten (#2738)
+* Fixed installation when building Catch2 with meson (#2722, #2742)
+* Fixed `catch_discover_tests` when using custom reporter and `PRE_TEST` discovery mode (#2747)
+* `catch_discover_tests` supports multi-config CMake generator in `PRE_TEST` discovery mode (#2739, #2746)
+
+
+## 3.4.0
+
+### Improvements
+* `VectorEquals` supports elements that provide only `==` and not `!=` (#2648)
+* Catch2 supports compiling with IAR compiler (#2651)
+* Various small internal performance improvements
+* Various small internal compilation time improvements
+* XMLReporter now reports location info for INFO and WARN (#1251)
+  * This bumps up the xml format version to 3
+* Documented that `SKIP` in generator constructor can be used to handle empty  generator (#1593)
+* Added experimental static analysis support to `TEST_CASE` and `SECTION` macros (#2681)
+  * The two macros are redefined in a way that helps the SA tools reason about the possible paths through a test case with sections.
+  * The support is controlled by the `CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT` option and autodetects clang-tidy and Coverity.
+* `*_THROWS`, `*_THROWS_AS`, etc now suppress warning coming from `__attribute__((warn_unused_result))` on GCC  (#2691)
+  * Unlike plain `[[nodiscard]]`, this warning is not silenced by void cast. WTF GCC?
+
+### Fixes
+* Fixed `assertionStarting` events being sent after the expr is evaluated (#2678)
+* Errors in `TEST_CASE` tags are now reported nicely (#2650)
+
+### Miscellaneous
+* Bunch of improvements to `catch_discover_tests`
+  * Added DISCOVERY_MODE option, so the discovery can happen either post build or pre-run.
+  * Fixed handling of semicolons and backslashes in test names (#2674, #2676)
+* meson build can disable building tests (#2693)
+* meson build properly sets meson version 0.54.1 as the minimal supported version (#2688)
+
 
 ## 3.3.2
 
@@ -149,7 +234,7 @@
 
 ### Fixes
 * Cleaned out some warnings and static analysis issues
-  * Suppressed `-Wcomma` warning rarely occuring in templated test cases (#2543)
+  * Suppressed `-Wcomma` warning rarely occurring in templated test cases (#2543)
   * Constified implementation details in `INFO` (#2564)
   * Made `MatcherGenericBase` copy constructor const (#2566)
 * Fixed serialization of test filters so the output roundtrips
@@ -352,7 +437,7 @@ v3 releases.
 * Added `STATIC_CHECK` macro, similar to `STATIC_REQUIRE` (#2318)
   * When deferred tu runtime, it behaves like `CHECK`, and not like `REQUIRE`.
 * You can have multiple tests with the same name, as long as other parts of the test identity differ (#1915, #1999, #2175)
-  * Test identity includes test's name, test's tags and and test's class name if applicable.
+  * Test identity includes test's name, test's tags and test's class name if applicable.
 * Added new warning, `UnmatchedTestSpec`, to error on test specs with no matching tests
 * The `-w`, `--warn` warning flags can now be provided multiple times to enable multiple warnings
 * The case-insensitive handling of tags is now more reliable and takes up less memory
@@ -517,7 +602,7 @@ v3 releases.
   * The `SECTION`(s) before the `GENERATE` will not be run multiple times, the following ones will.
 * Added `-D`/`--min-duration` command line flag (#1910)
   * If a test takes longer to finish than the provided value, its name and duration will be printed.
-  * This flag is overriden by setting `-d`/`--duration`.
+  * This flag is overridden by setting `-d`/`--duration`.
 
 ### Fixes
 * `TAPReporter` no longer skips successful assertions (#1983)
@@ -585,7 +670,7 @@ v3 releases.
 ### Fixes
 * Fixed computation of benchmarking column widths in ConsoleReporter (#1885, #1886)
 * Suppressed clang-tidy's `cppcoreguidelines-pro-type-vararg` in assertions (#1901)
-  * It was a false positive trigered by the new warning support workaround
+  * It was a false positive triggered by the new warning support workaround
 * Fixed bug in test specification parser handling of OR'd patterns using escaping (#1905)
 
 ### Miscellaneous
@@ -922,7 +1007,7 @@ v3 releases.
 
 ### Contrib
 * `ParseAndAddCatchTests` has learned how to use `DISABLED` CTest property (#1452)
-* `ParseAndAddCatchTests` now works when there is a whitspace before the test name (#1493)
+* `ParseAndAddCatchTests` now works when there is a whitespace before the test name (#1493)
 
 
 ### Miscellaneous
diff --git a/alpaka/thirdParty/catch2/docs/reporter-events.md b/alpaka/thirdParty/catch2/docs/reporter-events.md
index 32a0ae50..015f67be 100644
--- a/alpaka/thirdParty/catch2/docs/reporter-events.md
+++ b/alpaka/thirdParty/catch2/docs/reporter-events.md
@@ -96,12 +96,12 @@ void assertionStarting( AssertionInfo const& assertionInfo );
 void assertionEnded( AssertionStats const& assertionStats );
 ```
 
-`assertionStarting` is called after the expression is captured, but before
-the assertion expression is evaluated. This might seem like a minor
-distinction, but what it means is that if you have assertion like
-`REQUIRE( a + b == c + d )`, then what happens is that `a + b` and `c + d`
-are evaluated before `assertionStarting` is emitted, while the `==` is
-evaluated after the event.
+The `assertionStarting` event is emitted before the expression in the
+assertion is captured or evaluated and `assertionEnded` is emitted
+afterwards. This means that given assertion like `REQUIRE(a + b == c + d)`,
+Catch2 first emits `assertionStarting` event, then `a + b` and `c + d`
+are evaluated, then their results are captured, the comparison is evaluated,
+and then `assertionEnded` event is emitted.
 
 
 ## Benchmarking events
diff --git a/alpaka/thirdParty/catch2/docs/reporters.md b/alpaka/thirdParty/catch2/docs/reporters.md
index 496c61a9..e2abfe34 100644
--- a/alpaka/thirdParty/catch2/docs/reporters.md
+++ b/alpaka/thirdParty/catch2/docs/reporters.md
@@ -52,7 +52,7 @@ its machine-readable XML output to file `result-junit.xml`, and the
 uses ANSI colour codes for colouring the output.
 
 Using multiple reporters (or one reporter and one-or-more [event
-listeners](event-listener.md#top)) can have surprisingly complex semantics
+listeners](event-listeners.md#top)) can have surprisingly complex semantics
 when using customization points provided to reporters by Catch2, namely
 capturing stdout/stderr from test cases.
 
diff --git a/alpaka/thirdParty/catch2/docs/skipping-passing-failing.md b/alpaka/thirdParty/catch2/docs/skipping-passing-failing.md
index 4300d9d3..52bb18f7 100644
--- a/alpaka/thirdParty/catch2/docs/skipping-passing-failing.md
+++ b/alpaka/thirdParty/catch2/docs/skipping-passing-failing.md
@@ -9,7 +9,7 @@ In some situations it may not be possible to meaningfully execute a test case,
 for example when the system under test is missing certain hardware capabilities.
 If the required conditions can only be determined at runtime, it often
 doesn't make sense to consider such a test case as either passed or failed,
-because it simply can not run at all.
+because it simply cannot run at all.
 
 To properly express such scenarios, Catch2 provides a way to explicitly
 _skip_ test cases, using the `SKIP` macro:
@@ -84,6 +84,12 @@ exit code, same as it does if no test cases have run. This behaviour can
 be overridden using the [--allow-running-no-tests](command-line.md#no-tests-override)
 flag.
 
+### `SKIP` inside generators
+
+You can also use the `SKIP` macro inside generator's constructor to handle
+cases where the generator is empty, but you do not want to fail the test
+case.
+
 
 ## Passing and failing test cases
 
diff --git a/alpaka/thirdParty/catch2/docs/test-cases-and-sections.md b/alpaka/thirdParty/catch2/docs/test-cases-and-sections.md
index acebcc51..01c898bb 100644
--- a/alpaka/thirdParty/catch2/docs/test-cases-and-sections.md
+++ b/alpaka/thirdParty/catch2/docs/test-cases-and-sections.md
@@ -231,7 +231,7 @@ TEMPLATE_TEST_CASE( "vectors can be sized and resized", "[vector][template]", in
 
 > [Introduced](https://github.com/catchorg/Catch2/issues/1468) in Catch2 2.6.0.
 
-_template-type1_ through _template-typen_ is list of template template
+_template-type1_ through _template-typen_ is list of template
 types which should be combined with each of _template-arg1_ through
  _template-argm_, resulting in _n * m_ test cases. Inside the test case,
 the resulting type is available under the name of `TestType`.
diff --git a/alpaka/thirdParty/catch2/docs/tostring.md b/alpaka/thirdParty/catch2/docs/tostring.md
index adce3cc7..b99b6742 100644
--- a/alpaka/thirdParty/catch2/docs/tostring.md
+++ b/alpaka/thirdParty/catch2/docs/tostring.md
@@ -75,7 +75,7 @@ CATCH_TRANSLATE_EXCEPTION( MyType const& ex ) {
 
 Enums that already have a `<<` overload for `std::ostream` will convert to strings as expected.
 If you only need to convert enums to strings for test reporting purposes you can provide a `StringMaker` specialisations as any other type.
-However, as a convenience, Catch provides the `REGISTER_ENUM` helper macro that will generate the `StringMaker` specialiation for you with minimal code.
+However, as a convenience, Catch provides the `REGISTER_ENUM` helper macro that will generate the `StringMaker` specialisation for you with minimal code.
 Simply provide it the (qualified) enum name, followed by all the enum values, and you're done!
 
 E.g.
diff --git a/alpaka/thirdParty/catch2/docs/tutorial.md b/alpaka/thirdParty/catch2/docs/tutorial.md
index 342c7381..dfccac88 100644
--- a/alpaka/thirdParty/catch2/docs/tutorial.md
+++ b/alpaka/thirdParty/catch2/docs/tutorial.md
@@ -119,7 +119,7 @@ This is best explained through an example ([code](../examples/100-Fix-Section.cp
 
 ```c++
 TEST_CASE( "vectors can be sized and resized", "[vector]" ) {
-
+    // This setup will be done 4 times in total, once for each section
     std::vector<int> v( 5 );
 
     REQUIRE( v.size() == 5 );
@@ -152,11 +152,12 @@ TEST_CASE( "vectors can be sized and resized", "[vector]" ) {
 }
 ```
 
-For each `SECTION` the `TEST_CASE` is executed from the start. This means
+For each `SECTION` the `TEST_CASE` is **executed from the start**. This means
 that each section is entered with a freshly constructed vector `v`, that
 we know has size 5 and capacity at least 5, because the two assertions
-are also checked before the section is entered. Each run through a test
-case will execute one, and only one, leaf section.
+are also checked before the section is entered. This behaviour may not be
+ideal for tests where setup is expensive. Each run through a test case will
+execute one, and only one, leaf section.
 
 Section can also be nested, in which case the parent section can be
 entered multiple times, once for each leaf section. Nested sections are
diff --git a/alpaka/thirdParty/catch2/docs/why-catch.md b/alpaka/thirdParty/catch2/docs/why-catch.md
index 2c0178ca..b7367496 100644
--- a/alpaka/thirdParty/catch2/docs/why-catch.md
+++ b/alpaka/thirdParty/catch2/docs/why-catch.md
@@ -30,7 +30,7 @@ So what does Catch2 bring to the party that differentiates it from these? Apart
 * Output is through modular reporter objects. Basic textual and XML reporters are included. Custom reporters can easily be added.
 * JUnit xml output is supported for integration with third-party tools, such as CI servers.
 * A default main() function is provided, but you can supply your own for complete control (e.g. integration into your own test runner GUI).
-* A command line parser is provided and can still be used if you choose to provided your own main() function.
+* A command line parser is provided and can still be used if you choose to provide your own main() function.
 * Alternative assertion macro(s) report failures but don't abort the test case
 * Good set of facilities for floating point comparisons (`Catch::Approx` and full set of matchers)
 * Internal and friendly macros are isolated so name clashes can be managed
@@ -41,8 +41,8 @@ So what does Catch2 bring to the party that differentiates it from these? Apart
 
 ## Who else is using Catch2?
 
-A whole lot of people. According to the 2021 JetBrains C++ ecosystem survey,
-about 11% of C++ programmers use Catch2 for unit testing, making it the
+A whole lot of people. According to [the 2022 JetBrains C++ ecosystem survey](https://www.jetbrains.com/lp/devecosystem-2022/cpp/#Which-unit-testing-frameworks-do-you-regularly-use),
+about 12% of C++ programmers use Catch2 for unit testing, making it the
 second most popular unit testing framework.
 
 You can also take a look at the (incomplete) list of [open source projects](opensource-users.md#top)
diff --git a/alpaka/thirdParty/catch2/examples/010-TestCase.cpp b/alpaka/thirdParty/catch2/examples/010-TestCase.cpp
index 7ec208d5..9e5cd8cd 100644
--- a/alpaka/thirdParty/catch2/examples/010-TestCase.cpp
+++ b/alpaka/thirdParty/catch2/examples/010-TestCase.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 010-TestCase.cpp
 // And write tests in the same file:
 #include <catch2/catch_test_macros.hpp>
diff --git a/alpaka/thirdParty/catch2/examples/020-TestCase-1.cpp b/alpaka/thirdParty/catch2/examples/020-TestCase-1.cpp
index cec55799..a9d87dbc 100644
--- a/alpaka/thirdParty/catch2/examples/020-TestCase-1.cpp
+++ b/alpaka/thirdParty/catch2/examples/020-TestCase-1.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 020-TestCase-1.cpp
 
 #include <catch2/catch_test_macros.hpp>
diff --git a/alpaka/thirdParty/catch2/examples/020-TestCase-2.cpp b/alpaka/thirdParty/catch2/examples/020-TestCase-2.cpp
index 3f5767b3..72dd0ffb 100644
--- a/alpaka/thirdParty/catch2/examples/020-TestCase-2.cpp
+++ b/alpaka/thirdParty/catch2/examples/020-TestCase-2.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 020-TestCase-2.cpp
 
 // main() provided by Catch in file 020-TestCase-1.cpp.
diff --git a/alpaka/thirdParty/catch2/examples/030-Asn-Require-Check.cpp b/alpaka/thirdParty/catch2/examples/030-Asn-Require-Check.cpp
index 0d027ca9..62cd3cfc 100644
--- a/alpaka/thirdParty/catch2/examples/030-Asn-Require-Check.cpp
+++ b/alpaka/thirdParty/catch2/examples/030-Asn-Require-Check.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 030-Asn-Require-Check.cpp
 
 // Catch has two natural expression assertion macro's:
diff --git a/alpaka/thirdParty/catch2/examples/100-Fix-Section.cpp b/alpaka/thirdParty/catch2/examples/100-Fix-Section.cpp
index cfbfa79f..7c8d8aa8 100644
--- a/alpaka/thirdParty/catch2/examples/100-Fix-Section.cpp
+++ b/alpaka/thirdParty/catch2/examples/100-Fix-Section.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 100-Fix-Section.cpp
 
 // Catch has two ways to express fixtures:
diff --git a/alpaka/thirdParty/catch2/examples/110-Fix-ClassFixture.cpp b/alpaka/thirdParty/catch2/examples/110-Fix-ClassFixture.cpp
index 75c10da6..614c3797 100644
--- a/alpaka/thirdParty/catch2/examples/110-Fix-ClassFixture.cpp
+++ b/alpaka/thirdParty/catch2/examples/110-Fix-ClassFixture.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 110-Fix-ClassFixture.cpp
 
 // Catch has two ways to express fixtures:
diff --git a/alpaka/thirdParty/catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp b/alpaka/thirdParty/catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
index 99cdf9ab..345d53c3 100644
--- a/alpaka/thirdParty/catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
+++ b/alpaka/thirdParty/catch2/examples/120-Bdd-ScenarioGivenWhenThen.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 120-Bdd-ScenarioGivenWhenThen.cpp
 
 // main() provided by linkage with Catch2WithMain
diff --git a/alpaka/thirdParty/catch2/examples/210-Evt-EventListeners.cpp b/alpaka/thirdParty/catch2/examples/210-Evt-EventListeners.cpp
index 6cedb885..56b050d4 100644
--- a/alpaka/thirdParty/catch2/examples/210-Evt-EventListeners.cpp
+++ b/alpaka/thirdParty/catch2/examples/210-Evt-EventListeners.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 210-Evt-EventListeners.cpp
 
 // Contents:
diff --git a/alpaka/thirdParty/catch2/examples/231-Cfg-OutputStreams.cpp b/alpaka/thirdParty/catch2/examples/231-Cfg-OutputStreams.cpp
index b77c1273..da1713cf 100644
--- a/alpaka/thirdParty/catch2/examples/231-Cfg-OutputStreams.cpp
+++ b/alpaka/thirdParty/catch2/examples/231-Cfg-OutputStreams.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 231-Cfg-OutputStreams.cpp
 // Show how to replace the streams with a simple custom made streambuf.
 
diff --git a/alpaka/thirdParty/catch2/examples/300-Gen-OwnGenerator.cpp b/alpaka/thirdParty/catch2/examples/300-Gen-OwnGenerator.cpp
index 09643d6f..b5d951ac 100644
--- a/alpaka/thirdParty/catch2/examples/300-Gen-OwnGenerator.cpp
+++ b/alpaka/thirdParty/catch2/examples/300-Gen-OwnGenerator.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 300-Gen-OwnGenerator.cpp
 // Shows how to define a custom generator.
 
diff --git a/alpaka/thirdParty/catch2/examples/301-Gen-MapTypeConversion.cpp b/alpaka/thirdParty/catch2/examples/301-Gen-MapTypeConversion.cpp
index ba55f65f..a065d87a 100644
--- a/alpaka/thirdParty/catch2/examples/301-Gen-MapTypeConversion.cpp
+++ b/alpaka/thirdParty/catch2/examples/301-Gen-MapTypeConversion.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 301-Gen-MapTypeConversion.cpp
 // Shows how to use map to modify generator's return type.
 
diff --git a/alpaka/thirdParty/catch2/examples/302-Gen-Table.cpp b/alpaka/thirdParty/catch2/examples/302-Gen-Table.cpp
index 74319518..3cdb1430 100644
--- a/alpaka/thirdParty/catch2/examples/302-Gen-Table.cpp
+++ b/alpaka/thirdParty/catch2/examples/302-Gen-Table.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 302-Gen-Table.cpp
 // Shows how to use table to run a test many times with different inputs. Lifted from examples on
 // issue #850.
@@ -44,11 +52,11 @@ TEST_CASE("Table allows pre-computed test inputs and outputs", "[example][genera
 
 /* Possible simplifications where less legacy toolchain support is needed:
  *
- * - With libstdc++6 or newer, the make_tuple() calls can be ommitted
+ * - With libstdc++6 or newer, the make_tuple() calls can be omitted
  * (technically C++17 but does not require -std in GCC/Clang). See
  *   https://stackoverflow.com/questions/12436586/tuple-vector-and-initializer-list
  *
- * - In C++17 mode std::tie() and the preceding variable delcarations can be
+ * - In C++17 mode std::tie() and the preceding variable declarations can be
  * replaced by structured bindings: auto [test_input, expected] = GENERATE(
  * table<std::string, size_t>({ ...
  */
diff --git a/alpaka/thirdParty/catch2/examples/310-Gen-VariablesInGenerators.cpp b/alpaka/thirdParty/catch2/examples/310-Gen-VariablesInGenerators.cpp
index 0339c5f1..5d24d45a 100644
--- a/alpaka/thirdParty/catch2/examples/310-Gen-VariablesInGenerators.cpp
+++ b/alpaka/thirdParty/catch2/examples/310-Gen-VariablesInGenerators.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 310-Gen-VariablesInGenerator.cpp
 // Shows how to use variables when creating generators.
 
diff --git a/alpaka/thirdParty/catch2/examples/311-Gen-CustomCapture.cpp b/alpaka/thirdParty/catch2/examples/311-Gen-CustomCapture.cpp
index d12ee709..ee310383 100644
--- a/alpaka/thirdParty/catch2/examples/311-Gen-CustomCapture.cpp
+++ b/alpaka/thirdParty/catch2/examples/311-Gen-CustomCapture.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 // 311-Gen-CustomCapture.cpp
 // Shows how to provide custom capture list to the generator expression
 
diff --git a/alpaka/thirdParty/catch2/examples/CMakeLists.txt b/alpaka/thirdParty/catch2/examples/CMakeLists.txt
index f9933341..82734ada 100644
--- a/alpaka/thirdParty/catch2/examples/CMakeLists.txt
+++ b/alpaka/thirdParty/catch2/examples/CMakeLists.txt
@@ -30,6 +30,7 @@ set( SOURCES_IDIOMATIC_EXAMPLES
     110-Fix-ClassFixture.cpp
     120-Bdd-ScenarioGivenWhenThen.cpp
     210-Evt-EventListeners.cpp
+    232-Cfg-CustomMain.cpp
     300-Gen-OwnGenerator.cpp
     301-Gen-MapTypeConversion.cpp
     302-Gen-Table.cpp
@@ -53,7 +54,7 @@ set(ALL_EXAMPLE_TARGETS
 )
 
 foreach( name ${ALL_EXAMPLE_TARGETS} )
-    target_link_libraries( ${name} Catch2 Catch2WithMain )
+    target_link_libraries( ${name} Catch2WithMain )
 endforeach()
 
 
diff --git a/alpaka/thirdParty/catch2/extras/Catch.cmake b/alpaka/thirdParty/catch2/extras/Catch.cmake
index bc553591..8f30688c 100644
--- a/alpaka/thirdParty/catch2/extras/Catch.cmake
+++ b/alpaka/thirdParty/catch2/extras/Catch.cmake
@@ -35,8 +35,9 @@ same as the Catch name; see also ``TEST_PREFIX`` and ``TEST_SUFFIX``.
                          [TEST_LIST var]
                          [REPORTER reporter]
                          [OUTPUT_DIR dir]
-                         [OUTPUT_PREFIX prefix}
+                         [OUTPUT_PREFIX prefix]
                          [OUTPUT_SUFFIX suffix]
+                         [DISCOVERY_MODE <POST_BUILD|PRE_TEST>]
     )
 
   ``catch_discover_tests`` sets up a post-build command on the test executable
@@ -123,14 +124,28 @@ same as the Catch name; see also ``TEST_PREFIX`` and ``TEST_SUFFIX``.
     test executable and when the tests are executed themselves. This requires
     cmake/ctest >= 3.22.
 
+  `DISCOVERY_MODE mode``
+    Provides control over when ``catch_discover_tests`` performs test discovery.
+    By default, ``POST_BUILD`` sets up a post-build command to perform test discovery
+    at build time. In certain scenarios, like cross-compiling, this ``POST_BUILD``
+    behavior is not desirable. By contrast, ``PRE_TEST`` delays test discovery until
+    just prior to test execution. This way test discovery occurs in the target environment
+    where the test has a better chance at finding appropriate runtime dependencies.
+
+    ``DISCOVERY_MODE`` defaults to the value of the
+    ``CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE`` variable if it is not passed when
+    calling ``catch_discover_tests``. This provides a mechanism for globally selecting
+    a preferred test discovery behavior without having to modify each call site.
+
 #]=======================================================================]
 
 #------------------------------------------------------------------------------
 function(catch_discover_tests TARGET)
+
   cmake_parse_arguments(
     ""
     ""
-    "TEST_PREFIX;TEST_SUFFIX;WORKING_DIRECTORY;TEST_LIST;REPORTER;OUTPUT_DIR;OUTPUT_PREFIX;OUTPUT_SUFFIX"
+    "TEST_PREFIX;TEST_SUFFIX;WORKING_DIRECTORY;TEST_LIST;REPORTER;OUTPUT_DIR;OUTPUT_PREFIX;OUTPUT_SUFFIX;DISCOVERY_MODE"
     "TEST_SPEC;EXTRA_ARGS;PROPERTIES;DL_PATHS"
     ${ARGN}
   )
@@ -141,57 +156,128 @@ function(catch_discover_tests TARGET)
   if(NOT _TEST_LIST)
     set(_TEST_LIST ${TARGET}_TESTS)
   endif()
-
   if (_DL_PATHS)
     if(${CMAKE_VERSION} VERSION_LESS "3.22.0")
         message(FATAL_ERROR "The DL_PATHS option requires at least cmake 3.22")
     endif()
   endif()
+  if(NOT _DISCOVERY_MODE)
+    if(NOT CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE)
+      set(CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE "POST_BUILD")
+    endif()
+    set(_DISCOVERY_MODE ${CMAKE_CATCH_DISCOVER_TESTS_DISCOVERY_MODE})
+  endif()
+  if (NOT _DISCOVERY_MODE MATCHES "^(POST_BUILD|PRE_TEST)$")
+    message(FATAL_ERROR "Unknown DISCOVERY_MODE: ${_DISCOVERY_MODE}")
+  endif()
 
   ## Generate a unique name based on the extra arguments
   string(SHA1 args_hash "${_TEST_SPEC} ${_EXTRA_ARGS} ${_REPORTER} ${_OUTPUT_DIR} ${_OUTPUT_PREFIX} ${_OUTPUT_SUFFIX}")
   string(SUBSTRING ${args_hash} 0 7 args_hash)
 
   # Define rule to generate test list for aforementioned test executable
-  set(ctest_include_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_include-${args_hash}.cmake")
-  set(ctest_tests_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_tests-${args_hash}.cmake")
+  set(ctest_file_base "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}-${args_hash}")
+  set(ctest_include_file "${ctest_file_base}_include.cmake")
+  set(ctest_tests_file "${ctest_file_base}_tests.cmake")
+
   get_property(crosscompiling_emulator
     TARGET ${TARGET}
     PROPERTY CROSSCOMPILING_EMULATOR
   )
-  add_custom_command(
-    TARGET ${TARGET} POST_BUILD
-    BYPRODUCTS "${ctest_tests_file}"
-    COMMAND "${CMAKE_COMMAND}"
-            -D "TEST_TARGET=${TARGET}"
-            -D "TEST_EXECUTABLE=$<TARGET_FILE:${TARGET}>"
-            -D "TEST_EXECUTOR=${crosscompiling_emulator}"
-            -D "TEST_WORKING_DIR=${_WORKING_DIRECTORY}"
-            -D "TEST_SPEC=${_TEST_SPEC}"
-            -D "TEST_EXTRA_ARGS=${_EXTRA_ARGS}"
-            -D "TEST_PROPERTIES=${_PROPERTIES}"
-            -D "TEST_PREFIX=${_TEST_PREFIX}"
-            -D "TEST_SUFFIX=${_TEST_SUFFIX}"
-            -D "TEST_LIST=${_TEST_LIST}"
-            -D "TEST_REPORTER=${_REPORTER}"
-            -D "TEST_OUTPUT_DIR=${_OUTPUT_DIR}"
-            -D "TEST_OUTPUT_PREFIX=${_OUTPUT_PREFIX}"
-            -D "TEST_OUTPUT_SUFFIX=${_OUTPUT_SUFFIX}"
-            -D "TEST_DL_PATHS=${_DL_PATHS}"
-            -D "CTEST_FILE=${ctest_tests_file}"
-            -P "${_CATCH_DISCOVER_TESTS_SCRIPT}"
-    VERBATIM
-  )
 
-  file(WRITE "${ctest_include_file}"
-    "if(EXISTS \"${ctest_tests_file}\")\n"
-    "  include(\"${ctest_tests_file}\")\n"
-    "else()\n"
-    "  add_test(${TARGET}_NOT_BUILT-${args_hash} ${TARGET}_NOT_BUILT-${args_hash})\n"
-    "endif()\n"
-  )
+  if(_DISCOVERY_MODE STREQUAL "POST_BUILD")
+    add_custom_command(
+      TARGET ${TARGET} POST_BUILD
+      BYPRODUCTS "${ctest_tests_file}"
+      COMMAND "${CMAKE_COMMAND}"
+              -D "TEST_TARGET=${TARGET}"
+              -D "TEST_EXECUTABLE=$<TARGET_FILE:${TARGET}>"
+              -D "TEST_EXECUTOR=${crosscompiling_emulator}"
+              -D "TEST_WORKING_DIR=${_WORKING_DIRECTORY}"
+              -D "TEST_SPEC=${_TEST_SPEC}"
+              -D "TEST_EXTRA_ARGS=${_EXTRA_ARGS}"
+              -D "TEST_PROPERTIES=${_PROPERTIES}"
+              -D "TEST_PREFIX=${_TEST_PREFIX}"
+              -D "TEST_SUFFIX=${_TEST_SUFFIX}"
+              -D "TEST_LIST=${_TEST_LIST}"
+              -D "TEST_REPORTER=${_REPORTER}"
+              -D "TEST_OUTPUT_DIR=${_OUTPUT_DIR}"
+              -D "TEST_OUTPUT_PREFIX=${_OUTPUT_PREFIX}"
+              -D "TEST_OUTPUT_SUFFIX=${_OUTPUT_SUFFIX}"
+              -D "TEST_DL_PATHS=${_DL_PATHS}"
+              -D "CTEST_FILE=${ctest_tests_file}"
+              -P "${_CATCH_DISCOVER_TESTS_SCRIPT}"
+      VERBATIM
+    )
+
+    file(WRITE "${ctest_include_file}"
+      "if(EXISTS \"${ctest_tests_file}\")\n"
+      "  include(\"${ctest_tests_file}\")\n"
+      "else()\n"
+      "  add_test(${TARGET}_NOT_BUILT-${args_hash} ${TARGET}_NOT_BUILT-${args_hash})\n"
+      "endif()\n"
+    )
 
-  if(NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0") 
+  elseif(_DISCOVERY_MODE STREQUAL "PRE_TEST")
+
+    get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL
+        PROPERTY GENERATOR_IS_MULTI_CONFIG
+    )
+
+    if(GENERATOR_IS_MULTI_CONFIG)
+      set(ctest_tests_file "${ctest_file_base}_tests-$<CONFIG>.cmake")
+    endif()
+
+    string(CONCAT ctest_include_content
+      "if(EXISTS \"$<TARGET_FILE:${TARGET}>\")"                                    "\n"
+      "  if(NOT EXISTS \"${ctest_tests_file}\" OR"                                 "\n"
+      "     NOT \"${ctest_tests_file}\" IS_NEWER_THAN \"$<TARGET_FILE:${TARGET}>\" OR\n"
+      "     NOT \"${ctest_tests_file}\" IS_NEWER_THAN \"\${CMAKE_CURRENT_LIST_FILE}\")\n"
+      "    include(\"${_CATCH_DISCOVER_TESTS_SCRIPT}\")"                           "\n"
+      "    catch_discover_tests_impl("                                             "\n"
+      "      TEST_EXECUTABLE"        " [==[" "$<TARGET_FILE:${TARGET}>"   "]==]"   "\n"
+      "      TEST_EXECUTOR"          " [==[" "${crosscompiling_emulator}" "]==]"   "\n"
+      "      TEST_WORKING_DIR"       " [==[" "${_WORKING_DIRECTORY}"      "]==]"   "\n"
+      "      TEST_SPEC"              " [==[" "${_TEST_SPEC}"              "]==]"   "\n"
+      "      TEST_EXTRA_ARGS"        " [==[" "${_EXTRA_ARGS}"             "]==]"   "\n"
+      "      TEST_PROPERTIES"        " [==[" "${_PROPERTIES}"             "]==]"   "\n"
+      "      TEST_PREFIX"            " [==[" "${_TEST_PREFIX}"            "]==]"   "\n"
+      "      TEST_SUFFIX"            " [==[" "${_TEST_SUFFIX}"            "]==]"   "\n"
+      "      TEST_LIST"              " [==[" "${_TEST_LIST}"              "]==]"   "\n"
+      "      TEST_REPORTER"          " [==[" "${_REPORTER}"               "]==]"   "\n"
+      "      TEST_OUTPUT_DIR"        " [==[" "${_OUTPUT_DIR}"             "]==]"   "\n"
+      "      TEST_OUTPUT_PREFIX"     " [==[" "${_OUTPUT_PREFIX}"          "]==]"   "\n"
+      "      TEST_OUTPUT_SUFFIX"     " [==[" "${_OUTPUT_SUFFIX}"          "]==]"   "\n"
+      "      CTEST_FILE"             " [==[" "${ctest_tests_file}"        "]==]"   "\n"
+      "      TEST_DL_PATHS"          " [==[" "${_DL_PATHS}"               "]==]"   "\n"
+      "      CTEST_FILE"             " [==[" "${CTEST_FILE}"              "]==]"   "\n"
+      "    )"                                                                      "\n"
+      "  endif()"                                                                  "\n"
+      "  include(\"${ctest_tests_file}\")"                                         "\n"
+      "else()"                                                                     "\n"
+      "  add_test(${TARGET}_NOT_BUILT ${TARGET}_NOT_BUILT)"                        "\n"
+      "endif()"                                                                    "\n"
+    )
+
+    if(GENERATOR_IS_MULTI_CONFIG)
+      foreach(_config ${CMAKE_CONFIGURATION_TYPES})
+        file(GENERATE OUTPUT "${ctest_file_base}_include-${_config}.cmake" CONTENT "${ctest_include_content}" CONDITION $<CONFIG:${_config}>)
+      endforeach()
+      string(CONCAT ctest_include_multi_content
+        "if(NOT CTEST_CONFIGURATION_TYPE)"                                              "\n"
+        "  message(\"No configuration for testing specified, use '-C <cfg>'.\")"        "\n"
+        "else()"                                                                        "\n"
+        "  include(\"${ctest_file_base}_include-\${CTEST_CONFIGURATION_TYPE}.cmake\")"  "\n"
+        "endif()"                                                                       "\n"
+      )
+      file(GENERATE OUTPUT "${ctest_include_file}" CONTENT "${ctest_include_multi_content}")
+    else()
+      file(GENERATE OUTPUT "${ctest_file_base}_include.cmake" CONTENT "${ctest_include_content}")
+      file(WRITE "${ctest_include_file}" "include(\"${ctest_file_base}_include.cmake\")")
+    endif()
+  endif()
+
+  if(NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0")
     # Add discovered tests to directory TEST_INCLUDE_FILES
     set_property(DIRECTORY
       APPEND PROPERTY TEST_INCLUDE_FILES "${ctest_include_file}"
@@ -204,9 +290,7 @@ function(catch_discover_tests TARGET)
         PROPERTY TEST_INCLUDE_FILE "${ctest_include_file}"
       )
     else()
-      message(FATAL_ERROR
-        "Cannot set more than one TEST_INCLUDE_FILE"
-      )
+      message(FATAL_ERROR "Cannot set more than one TEST_INCLUDE_FILE")
     endif()
   endif()
 
diff --git a/alpaka/thirdParty/catch2/extras/CatchAddTests.cmake b/alpaka/thirdParty/catch2/extras/CatchAddTests.cmake
index beec3aed..692e3405 100644
--- a/alpaka/thirdParty/catch2/extras/CatchAddTests.cmake
+++ b/alpaka/thirdParty/catch2/extras/CatchAddTests.cmake
@@ -1,28 +1,6 @@
 # Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
 # file Copyright.txt or https://cmake.org/licensing for details.
 
-set(prefix "${TEST_PREFIX}")
-set(suffix "${TEST_SUFFIX}")
-set(spec ${TEST_SPEC})
-set(extra_args ${TEST_EXTRA_ARGS})
-set(properties ${TEST_PROPERTIES})
-set(reporter ${TEST_REPORTER})
-set(output_dir ${TEST_OUTPUT_DIR})
-set(output_prefix ${TEST_OUTPUT_PREFIX})
-set(output_suffix ${TEST_OUTPUT_SUFFIX})
-set(dl_paths ${TEST_DL_PATHS})
-set(script)
-set(suite)
-set(tests)
-
-if(WIN32)
-  set(dl_paths_variable_name PATH)
-elseif(APPLE)
-  set(dl_paths_variable_name DYLD_LIBRARY_PATH)
-else()
-  set(dl_paths_variable_name LD_LIBRARY_PATH)
-endif()
-
 function(add_command NAME)
   set(_args "")
   # use ARGV* instead of ARGN, because ARGN splits arrays into multiple arguments
@@ -38,119 +16,177 @@ function(add_command NAME)
   set(script "${script}${NAME}(${_args})\n" PARENT_SCOPE)
 endfunction()
 
-# Run test executable to get list of available tests
-if(NOT EXISTS "${TEST_EXECUTABLE}")
-  message(FATAL_ERROR
-    "Specified test executable '${TEST_EXECUTABLE}' does not exist"
+function(catch_discover_tests_impl)
+
+  cmake_parse_arguments(
+    ""
+    ""
+    "TEST_EXECUTABLE;TEST_WORKING_DIR;TEST_DL_PATHS;TEST_OUTPUT_DIR;TEST_OUTPUT_PREFIX;TEST_OUTPUT_SUFFIX;TEST_PREFIX;TEST_REPORTER;TEST_SPEC;TEST_SUFFIX;TEST_LIST;CTEST_FILE"
+    "TEST_EXTRA_ARGS;TEST_PROPERTIES;TEST_EXECUTOR"
+    ${ARGN}
   )
-endif()
 
-if(dl_paths)
-  cmake_path(CONVERT "${dl_paths}" TO_NATIVE_PATH_LIST paths)
-  set(ENV{${dl_paths_variable_name}} "${paths}")
-endif()
+  set(prefix "${_TEST_PREFIX}")
+  set(suffix "${_TEST_SUFFIX}")
+  set(spec ${_TEST_SPEC})
+  set(extra_args ${_TEST_EXTRA_ARGS})
+  set(properties ${_TEST_PROPERTIES})
+  set(reporter ${_TEST_REPORTER})
+  set(output_dir ${_TEST_OUTPUT_DIR})
+  set(output_prefix ${_TEST_OUTPUT_PREFIX})
+  set(output_suffix ${_TEST_OUTPUT_SUFFIX})
+  set(dl_paths ${_TEST_DL_PATHS})
+  set(script)
+  set(suite)
+  set(tests)
+
+  if(WIN32)
+    set(dl_paths_variable_name PATH)
+  elseif(APPLE)
+    set(dl_paths_variable_name DYLD_LIBRARY_PATH)
+  else()
+    set(dl_paths_variable_name LD_LIBRARY_PATH)
+  endif()
 
-execute_process(
-  COMMAND ${TEST_EXECUTOR} "${TEST_EXECUTABLE}" ${spec} --list-tests --verbosity quiet
-  OUTPUT_VARIABLE output
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY "${TEST_WORKING_DIR}"
-)
-if(NOT ${result} EQUAL 0)
-  message(FATAL_ERROR
-    "Error running test executable '${TEST_EXECUTABLE}':\n"
-    "  Result: ${result}\n"
-    "  Output: ${output}\n"
-  )
-endif()
+  # Run test executable to get list of available tests
+  if(NOT EXISTS "${_TEST_EXECUTABLE}")
+    message(FATAL_ERROR
+      "Specified test executable '${_TEST_EXECUTABLE}' does not exist"
+    )
+  endif()
 
-string(REPLACE "\n" ";" output "${output}")
-
-# Run test executable to get list of available reporters
-execute_process(
-  COMMAND ${TEST_EXECUTOR} "${TEST_EXECUTABLE}" ${spec} --list-reporters
-  OUTPUT_VARIABLE reporters_output
-  RESULT_VARIABLE reporters_result
-  WORKING_DIRECTORY "${TEST_WORKING_DIR}"
-)
-if(NOT ${reporters_result} EQUAL 0)
-  message(FATAL_ERROR
-    "Error running test executable '${TEST_EXECUTABLE}':\n"
-    "  Result: ${reporters_result}\n"
-    "  Output: ${reporters_output}\n"
-  )
-endif()
-string(FIND "${reporters_output}" "${reporter}" reporter_is_valid)
-if(reporter AND ${reporter_is_valid} EQUAL -1)
-  message(FATAL_ERROR
-    "\"${reporter}\" is not a valid reporter!\n"
-  )
-endif()
+  if(dl_paths)
+    cmake_path(CONVERT "${dl_paths}" TO_NATIVE_PATH_LIST paths)
+    set(ENV{${dl_paths_variable_name}} "${paths}")
+  endif()
 
-# Prepare reporter
-if(reporter)
-  set(reporter_arg "--reporter ${reporter}")
-endif()
+  execute_process(
+    COMMAND ${_TEST_EXECUTOR} "${_TEST_EXECUTABLE}" ${spec} --list-tests --verbosity quiet
+    OUTPUT_VARIABLE output
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY "${_TEST_WORKING_DIR}"
+  )
+  if(NOT ${result} EQUAL 0)
+    message(FATAL_ERROR
+      "Error running test executable '${_TEST_EXECUTABLE}':\n"
+      "  Result: ${result}\n"
+      "  Output: ${output}\n"
+    )
+  endif()
 
-# Prepare output dir
-if(output_dir AND NOT IS_ABSOLUTE ${output_dir})
-  set(output_dir "${TEST_WORKING_DIR}/${output_dir}")
-  if(NOT EXISTS ${output_dir})
-    file(MAKE_DIRECTORY ${output_dir})
+  # Make sure to escape ; (semicolons) in test names first, because
+  # that'd break the foreach loop for "Parse output" later and create
+  # wrongly splitted and thus failing test cases (false positives)
+  string(REPLACE ";" "\;" output "${output}")
+  string(REPLACE "\n" ";" output "${output}")
+
+  # Prepare reporter
+  if(reporter)
+    set(reporter_arg "--reporter ${reporter}")
+
+    # Run test executable to check whether reporter is available
+    # note that the use of --list-reporters is not the important part,
+    # we only want to check whether the execution succeeds with ${reporter_arg}
+    execute_process(
+      COMMAND ${_TEST_EXECUTOR} "${_TEST_EXECUTABLE}" ${spec} ${reporter_arg} --list-reporters
+      OUTPUT_VARIABLE reporter_check_output
+      RESULT_VARIABLE reporter_check_result
+      WORKING_DIRECTORY "${_TEST_WORKING_DIR}"
+    )
+    if(${reporter_check_result} EQUAL 255)
+      message(FATAL_ERROR
+        "\"${reporter}\" is not a valid reporter!\n"
+      )
+    elseif(NOT ${reporter_check_result} EQUAL 0)
+      message(FATAL_ERROR
+        "Error running test executable '${_TEST_EXECUTABLE}':\n"
+        "  Result: ${reporter_check_result}\n"
+        "  Output: ${reporter_check_output}\n"
+      )
+    endif()
   endif()
-endif()
 
-if(dl_paths)
-  foreach(path ${dl_paths})
-    cmake_path(NATIVE_PATH path native_path)
-    list(APPEND environment_modifications "${dl_paths_variable_name}=path_list_prepend:${native_path}")
-  endforeach()
-endif()
+  # Prepare output dir
+  if(output_dir AND NOT IS_ABSOLUTE ${output_dir})
+    set(output_dir "${_TEST_WORKING_DIR}/${output_dir}")
+    if(NOT EXISTS ${output_dir})
+      file(MAKE_DIRECTORY ${output_dir})
+    endif()
+  endif()
 
-# Parse output
-foreach(line ${output})
-  set(test ${line})
-  # Escape characters in test case names that would be parsed by Catch2
-  set(test_name ${test})
-  foreach(char , [ ])
-    string(REPLACE ${char} "\\${char}" test_name ${test_name})
-  endforeach(char)
-  # ...add output dir
-  if(output_dir)
-    string(REGEX REPLACE "[^A-Za-z0-9_]" "_" test_name_clean ${test_name})
-    set(output_dir_arg "--out ${output_dir}/${output_prefix}${test_name_clean}${output_suffix}")
+  if(dl_paths)
+    foreach(path ${dl_paths})
+      cmake_path(NATIVE_PATH path native_path)
+      list(APPEND environment_modifications "${dl_paths_variable_name}=path_list_prepend:${native_path}")
+    endforeach()
   endif()
-  
-  # ...and add to script
-  add_command(add_test
-    "${prefix}${test}${suffix}"
-    ${TEST_EXECUTOR}
-    "${TEST_EXECUTABLE}"
-    "${test_name}"
-    ${extra_args}
-    "${reporter_arg}"
-    "${output_dir_arg}"
-  )
-  add_command(set_tests_properties
-    "${prefix}${test}${suffix}"
-    PROPERTIES
-    WORKING_DIRECTORY "${TEST_WORKING_DIR}"
-    ${properties}
-  )
 
-   if(environment_modifications)
-     add_command(set_tests_properties
-       "${prefix}${test}${suffix}"
-       PROPERTIES
-       ENVIRONMENT_MODIFICATION "${environment_modifications}")
-   endif()
+  # Parse output
+  foreach(line ${output})
+    set(test "${line}")
+    # Escape characters in test case names that would be parsed by Catch2
+    # Note that the \ escaping must happen FIRST! Do not change the order.
+    set(test_name "${test}")
+    foreach(char \\ , [ ])
+      string(REPLACE ${char} "\\${char}" test_name "${test_name}")
+    endforeach(char)
+    # ...add output dir
+    if(output_dir)
+      string(REGEX REPLACE "[^A-Za-z0-9_]" "_" test_name_clean "${test_name}")
+      set(output_dir_arg "--out ${output_dir}/${output_prefix}${test_name_clean}${output_suffix}")
+    endif()
+
+    # ...and add to script
+    add_command(add_test
+      "${prefix}${test}${suffix}"
+      ${_TEST_EXECUTOR}
+      "${_TEST_EXECUTABLE}"
+      "${test_name}"
+      ${extra_args}
+      "${reporter_arg}"
+      "${output_dir_arg}"
+    )
+    add_command(set_tests_properties
+      "${prefix}${test}${suffix}"
+      PROPERTIES
+      WORKING_DIRECTORY "${_TEST_WORKING_DIR}"
+      ${properties}
+    )
+
+    if(environment_modifications)
+      add_command(set_tests_properties
+        "${prefix}${test}${suffix}"
+        PROPERTIES
+        ENVIRONMENT_MODIFICATION "${environment_modifications}")
+    endif()
+
+    list(APPEND tests "${prefix}${test}${suffix}")
+  endforeach()
 
-  list(APPEND tests "${prefix}${test}${suffix}")
-endforeach()
+  # Create a list of all discovered tests, which users may use to e.g. set
+  # properties on the tests
+  add_command(set ${_TEST_LIST} ${tests})
 
-# Create a list of all discovered tests, which users may use to e.g. set
-# properties on the tests
-add_command(set ${TEST_LIST} ${tests})
+  # Write CTest script
+  file(WRITE "${_CTEST_FILE}" "${script}")
+endfunction()
 
-# Write CTest script
-file(WRITE "${CTEST_FILE}" "${script}")
+if(CMAKE_SCRIPT_MODE_FILE)
+  catch_discover_tests_impl(
+    TEST_EXECUTABLE ${TEST_EXECUTABLE}
+    TEST_EXECUTOR ${TEST_EXECUTOR}
+    TEST_WORKING_DIR ${TEST_WORKING_DIR}
+    TEST_SPEC ${TEST_SPEC}
+    TEST_EXTRA_ARGS ${TEST_EXTRA_ARGS}
+    TEST_PROPERTIES ${TEST_PROPERTIES}
+    TEST_PREFIX ${TEST_PREFIX}
+    TEST_SUFFIX ${TEST_SUFFIX}
+    TEST_LIST ${TEST_LIST}
+    TEST_REPORTER ${TEST_REPORTER}
+    TEST_OUTPUT_DIR ${TEST_OUTPUT_DIR}
+    TEST_OUTPUT_PREFIX ${TEST_OUTPUT_PREFIX}
+    TEST_OUTPUT_SUFFIX ${TEST_OUTPUT_SUFFIX}
+    TEST_DL_PATHS ${TEST_DL_PATHS}
+    CTEST_FILE ${CTEST_FILE}
+  )
+endif()
diff --git a/alpaka/thirdParty/catch2/extras/CatchShardTests.cmake b/alpaka/thirdParty/catch2/extras/CatchShardTests.cmake
index 5e043cf0..68228f5a 100644
--- a/alpaka/thirdParty/catch2/extras/CatchShardTests.cmake
+++ b/alpaka/thirdParty/catch2/extras/CatchShardTests.cmake
@@ -46,7 +46,7 @@ function(catch_add_sharded_tests TARGET)
     APPEND PROPERTY TEST_INCLUDE_FILES "${ctest_include_file}"
   )
 
-  set(shard_impl_script_file "${CMAKE_CURRENT_LIST_DIR}/CatchShardTestsImpl.cmake")
+  set(shard_impl_script_file "${_CATCH_DISCOVER_SHARD_TESTS_IMPL_SCRIPT}")
 
   add_custom_command(
     TARGET ${TARGET} POST_BUILD
@@ -64,3 +64,11 @@ function(catch_add_sharded_tests TARGET)
 
 
 endfunction()
+
+
+###############################################################################
+
+set(_CATCH_DISCOVER_SHARD_TESTS_IMPL_SCRIPT
+    ${CMAKE_CURRENT_LIST_DIR}/CatchShardTestsImpl.cmake
+  CACHE INTERNAL "Catch2 full path to CatchShardTestsImpl.cmake helper file"
+)
diff --git a/alpaka/thirdParty/catch2/extras/catch_amalgamated.cpp b/alpaka/thirdParty/catch2/extras/catch_amalgamated.cpp
index a81b1b6a..f68c9005 100644
--- a/alpaka/thirdParty/catch2/extras/catch_amalgamated.cpp
+++ b/alpaka/thirdParty/catch2/extras/catch_amalgamated.cpp
@@ -1,3 +1,4 @@
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -5,8 +6,8 @@
 
 // SPDX-License-Identifier: BSL-1.0
 
-//  Catch v3.3.2
-//  Generated: 2023-02-26 10:28:48.270752
+//  Catch v3.5.2
+//  Generated: 2024-01-15 14:06:36.675713
 //  ----------------------------------------------------------
 //  This file is an amalgamation of multiple different files.
 //  You probably shouldn't edit it directly.
@@ -48,6 +49,80 @@ namespace Catch {
 } // namespace Catch
 
 
+// Adapted from donated nonius code.
+
+
+#include <vector>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last) {
+                if (!cfg.benchmarkNoAnalysis()) {
+                    std::vector<double> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+                    for (auto current = first; current != last; ++current) {
+                        samples.push_back( current->count() );
+                    }
+
+                    auto analysis = Catch::Benchmark::Detail::analyse_samples(
+                        cfg.benchmarkConfidenceInterval(),
+                        cfg.benchmarkResamples(),
+                        samples.data(),
+                        samples.data() + samples.size() );
+                    auto outliers = Catch::Benchmark::Detail::classify_outliers(
+                        samples.data(), samples.data() + samples.size() );
+
+                    auto wrap_estimate = [](Estimate<double> e) {
+                        return Estimate<FDuration> {
+                            FDuration(e.point),
+                                FDuration(e.lower_bound),
+                                FDuration(e.upper_bound),
+                                e.confidence_interval,
+                        };
+                    };
+                    std::vector<FDuration> samples2;
+                    samples2.reserve(samples.size());
+                    for (auto s : samples) {
+                        samples2.push_back( FDuration( s ) );
+                    }
+
+                    return {
+                        CATCH_MOVE(samples2),
+                        wrap_estimate(analysis.mean),
+                        wrap_estimate(analysis.standard_deviation),
+                        outliers,
+                        analysis.outlier_variance,
+                    };
+                } else {
+                    std::vector<FDuration> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+
+                    FDuration mean = FDuration(0);
+                    int i = 0;
+                    for (auto it = first; it < last; ++it, ++i) {
+                        samples.push_back(FDuration(*it));
+                        mean += FDuration(*it);
+                    }
+                    mean /= i;
+
+                    return SampleAnalysis{
+                        CATCH_MOVE(samples),
+                        Estimate<FDuration>{ mean, mean, mean, 0.0 },
+                        Estimate<FDuration>{ FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             0.0 },
+                        OutlierClassification{},
+                        0.0
+                    };
+                }
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+
 
 
 namespace Catch {
@@ -60,6 +135,7 @@ namespace Catch {
 
 
 
+
 #include <exception>
 
 namespace Catch {
@@ -86,9 +162,11 @@ namespace Catch {
 
 
 
+#include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
-#include <iterator>
+#include <numeric>
 #include <random>
 
 
@@ -96,139 +174,199 @@ namespace Catch {
 #include <future>
 #endif
 
-namespace {
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            namespace {
+
+                template <typename URng, typename Estimator>
+                static sample
+                resample( URng& rng,
+                          unsigned int resamples,
+                          double const* first,
+                          double const* last,
+                          Estimator& estimator ) {
+                    auto n = static_cast<size_t>( last - first );
+                    std::uniform_int_distribution<size_t> dist( 0, n - 1 );
+
+                    sample out;
+                    out.reserve( resamples );
+                    std::vector<double> resampled;
+                    resampled.reserve( n );
+                    for ( size_t i = 0; i < resamples; ++i ) {
+                        resampled.clear();
+                        for ( size_t s = 0; s < n; ++s ) {
+                            resampled.push_back( first[dist( rng )] );
+                        }
+                        const auto estimate =
+                            estimator( resampled.data(), resampled.data() + resampled.size() );
+                        out.push_back( estimate );
+                    }
+                    std::sort( out.begin(), out.end() );
+                    return out;
+                }
 
-using Catch::Benchmark::Detail::sample;
-
-     template <typename URng, typename Estimator>
-     sample resample(URng& rng, unsigned int resamples, std::vector<double>::iterator first, std::vector<double>::iterator last, Estimator& estimator) {
-         auto n = static_cast<size_t>(last - first);
-         std::uniform_int_distribution<decltype(n)> dist(0, n - 1);
-
-         sample out;
-         out.reserve(resamples);
-         std::generate_n(std::back_inserter(out), resamples, [n, first, &estimator, &dist, &rng] {
-             std::vector<double> resampled;
-             resampled.reserve(n);
-             std::generate_n(std::back_inserter(resampled), n, [first, &dist, &rng] { return first[static_cast<std::ptrdiff_t>(dist(rng))]; });
-             return estimator(resampled.begin(), resampled.end());
-         });
-         std::sort(out.begin(), out.end());
-         return out;
-     }
-
-
-    double erf_inv(double x) {
-        // Code accompanying the article "Approximating the erfinv function" in GPU Computing Gems, Volume 2
-        double w, p;
-
-        w = -log((1.0 - x) * (1.0 + x));
-
-        if (w < 6.250000) {
-            w = w - 3.125000;
-            p = -3.6444120640178196996e-21;
-            p = -1.685059138182016589e-19 + p * w;
-            p = 1.2858480715256400167e-18 + p * w;
-            p = 1.115787767802518096e-17 + p * w;
-            p = -1.333171662854620906e-16 + p * w;
-            p = 2.0972767875968561637e-17 + p * w;
-            p = 6.6376381343583238325e-15 + p * w;
-            p = -4.0545662729752068639e-14 + p * w;
-            p = -8.1519341976054721522e-14 + p * w;
-            p = 2.6335093153082322977e-12 + p * w;
-            p = -1.2975133253453532498e-11 + p * w;
-            p = -5.4154120542946279317e-11 + p * w;
-            p = 1.051212273321532285e-09 + p * w;
-            p = -4.1126339803469836976e-09 + p * w;
-            p = -2.9070369957882005086e-08 + p * w;
-            p = 4.2347877827932403518e-07 + p * w;
-            p = -1.3654692000834678645e-06 + p * w;
-            p = -1.3882523362786468719e-05 + p * w;
-            p = 0.0001867342080340571352 + p * w;
-            p = -0.00074070253416626697512 + p * w;
-            p = -0.0060336708714301490533 + p * w;
-            p = 0.24015818242558961693 + p * w;
-            p = 1.6536545626831027356 + p * w;
-        } else if (w < 16.000000) {
-            w = sqrt(w) - 3.250000;
-            p = 2.2137376921775787049e-09;
-            p = 9.0756561938885390979e-08 + p * w;
-            p = -2.7517406297064545428e-07 + p * w;
-            p = 1.8239629214389227755e-08 + p * w;
-            p = 1.5027403968909827627e-06 + p * w;
-            p = -4.013867526981545969e-06 + p * w;
-            p = 2.9234449089955446044e-06 + p * w;
-            p = 1.2475304481671778723e-05 + p * w;
-            p = -4.7318229009055733981e-05 + p * w;
-            p = 6.8284851459573175448e-05 + p * w;
-            p = 2.4031110387097893999e-05 + p * w;
-            p = -0.0003550375203628474796 + p * w;
-            p = 0.00095328937973738049703 + p * w;
-            p = -0.0016882755560235047313 + p * w;
-            p = 0.0024914420961078508066 + p * w;
-            p = -0.0037512085075692412107 + p * w;
-            p = 0.005370914553590063617 + p * w;
-            p = 1.0052589676941592334 + p * w;
-            p = 3.0838856104922207635 + p * w;
-        } else {
-            w = sqrt(w) - 5.000000;
-            p = -2.7109920616438573243e-11;
-            p = -2.5556418169965252055e-10 + p * w;
-            p = 1.5076572693500548083e-09 + p * w;
-            p = -3.7894654401267369937e-09 + p * w;
-            p = 7.6157012080783393804e-09 + p * w;
-            p = -1.4960026627149240478e-08 + p * w;
-            p = 2.9147953450901080826e-08 + p * w;
-            p = -6.7711997758452339498e-08 + p * w;
-            p = 2.2900482228026654717e-07 + p * w;
-            p = -9.9298272942317002539e-07 + p * w;
-            p = 4.5260625972231537039e-06 + p * w;
-            p = -1.9681778105531670567e-05 + p * w;
-            p = 7.5995277030017761139e-05 + p * w;
-            p = -0.00021503011930044477347 + p * w;
-            p = -0.00013871931833623122026 + p * w;
-            p = 1.0103004648645343977 + p * w;
-            p = 4.8499064014085844221 + p * w;
-        }
-        return p * x;
-    }
-
-    double standard_deviation(std::vector<double>::iterator first, std::vector<double>::iterator last) {
-        auto m = Catch::Benchmark::Detail::mean(first, last);
-        double variance = std::accumulate( first,
-                                           last,
-                                           0.,
-                                           [m]( double a, double b ) {
-                                               double diff = b - m;
-                                               return a + diff * diff;
-                                           } ) /
-                          ( last - first );
-        return std::sqrt( variance );
-    }
+                static double outlier_variance( Estimate<double> mean,
+                                                Estimate<double> stddev,
+                                                int n ) {
+                    double sb = stddev.point;
+                    double mn = mean.point / n;
+                    double mg_min = mn / 2.;
+                    double sg = (std::min)( mg_min / 4., sb / std::sqrt( n ) );
+                    double sg2 = sg * sg;
+                    double sb2 = sb * sb;
+
+                    auto c_max = [n, mn, sb2, sg2]( double x ) -> double {
+                        double k = mn - x;
+                        double d = k * k;
+                        double nd = n * d;
+                        double k0 = -n * nd;
+                        double k1 = sb2 - n * sg2 + nd;
+                        double det = k1 * k1 - 4 * sg2 * k0;
+                        return static_cast<int>( -2. * k0 /
+                                                 ( k1 + std::sqrt( det ) ) );
+                    };
+
+                    auto var_out = [n, sb2, sg2]( double c ) {
+                        double nc = n - c;
+                        return ( nc / n ) * ( sb2 - nc * sg2 );
+                    };
+
+                    return (std::min)( var_out( 1 ),
+                                       var_out(
+                                           (std::min)( c_max( 0. ),
+                                                       c_max( mg_min ) ) ) ) /
+                           sb2;
+                }
 
-}
+                static double erf_inv( double x ) {
+                    // Code accompanying the article "Approximating the erfinv
+                    // function" in GPU Computing Gems, Volume 2
+                    double w, p;
+
+                    w = -log( ( 1.0 - x ) * ( 1.0 + x ) );
+
+                    if ( w < 6.250000 ) {
+                        w = w - 3.125000;
+                        p = -3.6444120640178196996e-21;
+                        p = -1.685059138182016589e-19 + p * w;
+                        p = 1.2858480715256400167e-18 + p * w;
+                        p = 1.115787767802518096e-17 + p * w;
+                        p = -1.333171662854620906e-16 + p * w;
+                        p = 2.0972767875968561637e-17 + p * w;
+                        p = 6.6376381343583238325e-15 + p * w;
+                        p = -4.0545662729752068639e-14 + p * w;
+                        p = -8.1519341976054721522e-14 + p * w;
+                        p = 2.6335093153082322977e-12 + p * w;
+                        p = -1.2975133253453532498e-11 + p * w;
+                        p = -5.4154120542946279317e-11 + p * w;
+                        p = 1.051212273321532285e-09 + p * w;
+                        p = -4.1126339803469836976e-09 + p * w;
+                        p = -2.9070369957882005086e-08 + p * w;
+                        p = 4.2347877827932403518e-07 + p * w;
+                        p = -1.3654692000834678645e-06 + p * w;
+                        p = -1.3882523362786468719e-05 + p * w;
+                        p = 0.0001867342080340571352 + p * w;
+                        p = -0.00074070253416626697512 + p * w;
+                        p = -0.0060336708714301490533 + p * w;
+                        p = 0.24015818242558961693 + p * w;
+                        p = 1.6536545626831027356 + p * w;
+                    } else if ( w < 16.000000 ) {
+                        w = sqrt( w ) - 3.250000;
+                        p = 2.2137376921775787049e-09;
+                        p = 9.0756561938885390979e-08 + p * w;
+                        p = -2.7517406297064545428e-07 + p * w;
+                        p = 1.8239629214389227755e-08 + p * w;
+                        p = 1.5027403968909827627e-06 + p * w;
+                        p = -4.013867526981545969e-06 + p * w;
+                        p = 2.9234449089955446044e-06 + p * w;
+                        p = 1.2475304481671778723e-05 + p * w;
+                        p = -4.7318229009055733981e-05 + p * w;
+                        p = 6.8284851459573175448e-05 + p * w;
+                        p = 2.4031110387097893999e-05 + p * w;
+                        p = -0.0003550375203628474796 + p * w;
+                        p = 0.00095328937973738049703 + p * w;
+                        p = -0.0016882755560235047313 + p * w;
+                        p = 0.0024914420961078508066 + p * w;
+                        p = -0.0037512085075692412107 + p * w;
+                        p = 0.005370914553590063617 + p * w;
+                        p = 1.0052589676941592334 + p * w;
+                        p = 3.0838856104922207635 + p * w;
+                    } else {
+                        w = sqrt( w ) - 5.000000;
+                        p = -2.7109920616438573243e-11;
+                        p = -2.5556418169965252055e-10 + p * w;
+                        p = 1.5076572693500548083e-09 + p * w;
+                        p = -3.7894654401267369937e-09 + p * w;
+                        p = 7.6157012080783393804e-09 + p * w;
+                        p = -1.4960026627149240478e-08 + p * w;
+                        p = 2.9147953450901080826e-08 + p * w;
+                        p = -6.7711997758452339498e-08 + p * w;
+                        p = 2.2900482228026654717e-07 + p * w;
+                        p = -9.9298272942317002539e-07 + p * w;
+                        p = 4.5260625972231537039e-06 + p * w;
+                        p = -1.9681778105531670567e-05 + p * w;
+                        p = 7.5995277030017761139e-05 + p * w;
+                        p = -0.00021503011930044477347 + p * w;
+                        p = -0.00013871931833623122026 + p * w;
+                        p = 1.0103004648645343977 + p * w;
+                        p = 4.8499064014085844221 + p * w;
+                    }
+                    return p * x;
+                }
+
+                static double
+                standard_deviation( double const* first, double const* last ) {
+                    auto m = Catch::Benchmark::Detail::mean( first, last );
+                    double variance =
+                        std::accumulate( first,
+                                         last,
+                                         0.,
+                                         [m]( double a, double b ) {
+                                             double diff = b - m;
+                                             return a + diff * diff;
+                                         } ) /
+                        ( last - first );
+                    return std::sqrt( variance );
+                }
+
+                static sample jackknife( double ( *estimator )( double const*,
+                                                                double const* ),
+                                         double* first,
+                                         double* last ) {
+                    const auto second = first + 1;
+                    sample results;
+                    results.reserve( static_cast<size_t>( last - first ) );
+
+                    for ( auto it = first; it != last; ++it ) {
+                        std::iter_swap( it, first );
+                        results.push_back( estimator( second, last ) );
+                    }
+
+                    return results;
+                }
+
+
+            } // namespace
+        }     // namespace Detail
+    }         // namespace Benchmark
+} // namespace Catch
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
 
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-            bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic pop
-#endif
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last ) {
                 auto count = last - first;
                 double idx = (count - 1) * k / static_cast<double>(q);
                 int j = static_cast<int>(idx);
                 double g = idx - j;
                 std::nth_element(first, first + j, last);
                 auto xj = first[j];
-                if ( directCompare( g, 0 ) ) {
+                if ( Catch::Detail::directCompare( g, 0 ) ) {
                     return xj;
                 }
 
@@ -236,6 +374,48 @@ namespace Catch {
                 return xj + g * (xj1 - xj);
             }
 
+            OutlierClassification
+            classify_outliers( double const* first, double const* last ) {
+                std::vector<double> copy( first, last );
+
+                auto q1 = weighted_average_quantile( 1, 4, copy.data(), copy.data() + copy.size() );
+                auto q3 = weighted_average_quantile( 3, 4, copy.data(), copy.data() + copy.size() );
+                auto iqr = q3 - q1;
+                auto los = q1 - ( iqr * 3. );
+                auto lom = q1 - ( iqr * 1.5 );
+                auto him = q3 + ( iqr * 1.5 );
+                auto his = q3 + ( iqr * 3. );
+
+                OutlierClassification o;
+                for ( ; first != last; ++first ) {
+                    const double t = *first;
+                    if ( t < los ) {
+                        ++o.low_severe;
+                    } else if ( t < lom ) {
+                        ++o.low_mild;
+                    } else if ( t > his ) {
+                        ++o.high_severe;
+                    } else if ( t > him ) {
+                        ++o.high_mild;
+                    }
+                    ++o.samples_seen;
+                }
+                return o;
+            }
+
+            double mean( double const* first, double const* last ) {
+                auto count = last - first;
+                double sum = 0.;
+                while (first != last) {
+                    sum += *first;
+                    ++first;
+                }
+                return sum / static_cast<double>(count);
+            }
+
+            double normal_cdf( double x ) {
+                return std::erfc( -x / std::sqrt( 2.0 ) ) / 2.0;
+            }
 
             double erfc_inv(double x) {
                 return erf_inv(1.0 - x);
@@ -257,50 +437,77 @@ namespace Catch {
                 return result;
             }
 
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) ) {
+                auto n_samples = last - first;
+
+                double point = estimator( first, last );
+                // Degenerate case with a single sample
+                if ( n_samples == 1 )
+                    return { point, point, point, confidence_level };
+
+                sample jack = jackknife( estimator, first, last );
+                double jack_mean =
+                    mean( jack.data(), jack.data() + jack.size() );
+                double sum_squares = 0, sum_cubes = 0;
+                for ( double x : jack ) {
+                    auto difference = jack_mean - x;
+                    auto square = difference * difference;
+                    auto cube = square * difference;
+                    sum_squares += square;
+                    sum_cubes += cube;
+                }
 
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n) {
-                double sb = stddev.point;
-                double mn = mean.point / n;
-                double mg_min = mn / 2.;
-                double sg = (std::min)(mg_min / 4., sb / std::sqrt(n));
-                double sg2 = sg * sg;
-                double sb2 = sb * sb;
+                double accel = sum_cubes / ( 6 * std::pow( sum_squares, 1.5 ) );
+                long n = static_cast<long>( resample.size() );
+                double prob_n =
+                    std::count_if( resample.begin(),
+                                   resample.end(),
+                                   [point]( double x ) { return x < point; } ) /
+                    static_cast<double>( n );
+                // degenerate case with uniform samples
+                if ( Catch::Detail::directCompare( prob_n, 0. ) ) {
+                    return { point, point, point, confidence_level };
+                }
 
-                auto c_max = [n, mn, sb2, sg2](double x) -> double {
-                    double k = mn - x;
-                    double d = k * k;
-                    double nd = n * d;
-                    double k0 = -n * nd;
-                    double k1 = sb2 - n * sg2 + nd;
-                    double det = k1 * k1 - 4 * sg2 * k0;
-                    return static_cast<int>(-2. * k0 / (k1 + std::sqrt(det)));
-                };
+                double bias = normal_quantile( prob_n );
+                double z1 = normal_quantile( ( 1. - confidence_level ) / 2. );
 
-                auto var_out = [n, sb2, sg2](double c) {
-                    double nc = n - c;
-                    return (nc / n) * (sb2 - nc * sg2);
+                auto cumn = [n]( double x ) -> long {
+                    return std::lround( normal_cdf( x ) *
+                                        static_cast<double>( n ) );
                 };
-
-                return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) / sb2;
-            }
-
-
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last) {
-                CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
-                CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
-                static std::random_device entropy;
-                CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
-
-                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
-
-                auto mean = &Detail::mean<std::vector<double>::iterator>;
+                auto a = [bias, accel]( double b ) {
+                    return bias + b / ( 1. - accel * b );
+                };
+                double b1 = bias + z1;
+                double b2 = bias - z1;
+                double a1 = a( b1 );
+                double a2 = a( b2 );
+                auto lo = static_cast<size_t>( (std::max)( cumn( a1 ), 0l ) );
+                auto hi =
+                    static_cast<size_t>( (std::min)( cumn( a2 ), n - 1 ) );
+
+                return { point, resample[lo], resample[hi], confidence_level };
+            }
+
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last) {
+                auto mean = &Detail::mean;
                 auto stddev = &standard_deviation;
 
 #if defined(CATCH_CONFIG_USE_ASYNC)
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
+                auto Estimate = [=](double(*f)(double const*, double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
                     return std::async(std::launch::async, [=] {
-                        std::mt19937 rng(seed);
+                        SimplePcg32 rng( seed );
                         auto resampled = resample(rng, n_resamples, first, last, f);
                         return bootstrap(confidence_level, first, last, resampled, f);
                     });
@@ -312,9 +519,10 @@ namespace Catch {
                 auto mean_estimate = mean_future.get();
                 auto stddev_estimate = stddev_future.get();
 #else
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
-                    std::mt19937 rng(seed);
+                auto Estimate = [=](double(*f)(double const* , double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
+                    SimplePcg32 rng( seed );
                     auto resampled = resample(rng, n_resamples, first, last, f);
                     return bootstrap(confidence_level, first, last, resampled, f);
                 };
@@ -323,6 +531,7 @@ namespace Catch {
                 auto stddev_estimate = Estimate(stddev);
 #endif // CATCH_USE_ASYNC
 
+                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
                 double outlier_variance = Detail::outlier_variance(mean_estimate, stddev_estimate, n);
 
                 return { mean_estimate, stddev_estimate, outlier_variance };
@@ -394,10 +603,10 @@ namespace Catch {
     }
 
 namespace literals {
-    Approx operator "" _a(long double val) {
+    Approx operator ""_a(long double val) {
         return Approx(val);
     }
-    Approx operator "" _a(unsigned long long val) {
+    Approx operator ""_a(unsigned long long val) {
         return Approx(val);
     }
 } // end namespace literals
@@ -596,7 +805,7 @@ namespace Catch {
             elem = trim(elem);
         }
 
-        // Insert the default reporter if user hasn't asked for a specfic one
+        // Insert the default reporter if user hasn't asked for a specific one
         if ( m_data.reporterSpecifications.empty() ) {
             m_data.reporterSpecifications.push_back( {
 #if defined( CATCH_CONFIG_DEFAULT_REPORTER )
@@ -775,7 +984,11 @@ namespace Catch {
     }
 
 
-    Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) {
+    Capturer::Capturer( StringRef macroName,
+                        SourceLineInfo const& lineInfo,
+                        ResultWas::OfType resultType,
+                        StringRef names ):
+        m_resultCapture( getResultCapture() ) {
         auto trimmed = [&] (size_t start, size_t end) {
             while (names[start] == ',' || isspace(static_cast<unsigned char>(names[start]))) {
                 ++start;
@@ -852,6 +1065,8 @@ namespace Catch {
 
 
 
+#include <exception>
+
 namespace Catch {
 
     namespace {
@@ -862,7 +1077,7 @@ namespace Catch {
 
         public: // IRegistryHub
             RegistryHub() = default;
-            IReporterRegistry const& getReporterRegistry() const override {
+            ReporterRegistry const& getReporterRegistry() const override {
                 return m_reporterRegistry;
             }
             ITestCaseRegistry const& getTestCaseRegistry() const override {
@@ -938,6 +1153,7 @@ namespace Catch {
 
 #include <algorithm>
 #include <cassert>
+#include <exception>
 #include <iomanip>
 #include <set>
 
@@ -1420,12 +1636,20 @@ namespace Catch {
         for (size_t idx = 0; idx < originalTags.size(); ++idx) {
             auto c = originalTags[idx];
             if (c == '[') {
-                assert(!inTag);
+                CATCH_ENFORCE(
+                    !inTag,
+                    "Found '[' inside a tag while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = true;
                 tagStart = idx;
             }
             if (c == ']') {
-                assert(inTag);
+                CATCH_ENFORCE(
+                    inTag,
+                    "Found unmatched ']' while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = false;
                 tagEnd = idx;
                 assert(tagStart < tagEnd);
@@ -1434,7 +1658,11 @@ namespace Catch {
                 // it over to backing storage and actually reference the
                 // backing storage in the saved tags
                 StringRef tagStr = originalTags.substr(tagStart+1, tagEnd - tagStart - 1);
-                CATCH_ENFORCE(!tagStr.empty(), "Empty tags are not allowed");
+                CATCH_ENFORCE( !tagStr.empty(),
+                               "Found an empty tag while registering test case '"
+                                   << _nameAndTags.name << "' at "
+                                   << _lineInfo );
+
                 enforceNotReservedTag(tagStr, lineInfo);
                 properties |= parseSpecialTag(tagStr);
                 // When copying a tag to the backing storage, we need to
@@ -1448,8 +1676,12 @@ namespace Catch {
                 // the tags.
                 internalAppendTag(tagStr);
             }
-            (void)inTag; // Silence "set-but-unused" warning in release mode.
         }
+        CATCH_ENFORCE( !inTag,
+                       "Found an unclosed tag while registering test case '"
+                           << _nameAndTags.name << "' at " << _lineInfo );
+
+
         // Add [.] if relevant
         if (isHidden()) {
             internalAppendTag("."_sr);
@@ -1625,16 +1857,18 @@ namespace Catch {
         return std::any_of( m_filters.begin(), m_filters.end(), [&]( Filter const& f ){ return f.matches( testCase ); } );
     }
 
-    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const
-    {
-        Matches matches( m_filters.size() );
-        std::transform( m_filters.begin(), m_filters.end(), matches.begin(), [&]( Filter const& filter ){
+    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const {
+        Matches matches;
+        matches.reserve( m_filters.size() );
+        for ( auto const& filter : m_filters ) {
             std::vector<TestCaseHandle const*> currentMatches;
-            for( auto const& test : testCases )
-                if( isThrowSafe( test, config ) && filter.matches( test.getTestCaseInfo() ) )
+            for ( auto const& test : testCases )
+                if ( isThrowSafe( test, config ) &&
+                     filter.matches( test.getTestCaseInfo() ) )
                     currentMatches.emplace_back( &test );
-            return FilterMatch{ extractFilterName(filter), currentMatches };
-        } );
+            matches.push_back(
+                FilterMatch{ extractFilterName( filter ), currentMatches } );
+        }
         return matches;
     }
 
@@ -1991,6 +2225,19 @@ namespace Catch {
 }
 
 
+
+
+namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator ) {
+            getMutableRegistryHub().registerTranslator(
+                CATCH_MOVE( translator ) );
+        }
+    } // namespace Detail
+} // namespace Catch
+
+
 #include <ostream>
 
 namespace Catch {
@@ -2021,7 +2268,7 @@ namespace Catch {
     }
 
     Version const& libraryVersion() {
-        static Version version( 3, 3, 2, "", 0 );
+        static Version version( 3, 5, 2, "", 0 );
         return version;
     }
 
@@ -2074,8 +2321,36 @@ namespace Detail {
 
 
 
+#include <random>
 
-std::uint32_t Catch::Generators::Detail::getSeed() { return sharedRng()(); }
+namespace Catch {
+    namespace Generators {
+        namespace Detail {
+            std::uint32_t getSeed() { return sharedRng()(); }
+        } // namespace Detail
+
+        struct RandomFloatingGenerator<long double>::PImpl {
+            PImpl( long double a, long double b, uint32_t seed ):
+                rng( seed ), dist( a, b ) {}
+
+            Catch::SimplePcg32 rng;
+            std::uniform_real_distribution<long double> dist;
+        };
+
+        RandomFloatingGenerator<long double>::RandomFloatingGenerator(
+            long double a, long double b, std::uint32_t seed) :
+            m_pimpl(Catch::Detail::make_unique<PImpl>(a, b, seed)) {
+            static_cast<void>( next() );
+        }
+
+        RandomFloatingGenerator<long double>::~RandomFloatingGenerator() =
+            default;
+        bool RandomFloatingGenerator<long double>::next() {
+            m_current_number = m_pimpl->dist( m_pimpl->rng );
+            return true;
+        }
+    } // namespace Generators
+} // namespace Catch
 
 
 
@@ -2135,9 +2410,7 @@ namespace Catch {
 
 
 
-#include <algorithm>
 #include <cassert>
-#include <iomanip>
 
 namespace Catch {
 
@@ -2172,8 +2445,6 @@ namespace Catch {
         infoMessages( _infoMessages ),
         totals( _totals )
     {
-        assertionResult.m_resultData.lazyExpression.m_transientExpression = _assertionResult.m_resultData.lazyExpression.m_transientExpression;
-
         if( assertionResult.hasMessage() ) {
             // Copy message into messages list.
             // !TBD This should have been done earlier, somewhere
@@ -2232,14 +2503,6 @@ namespace Catch {
 
 
 namespace Catch {
-    IReporterRegistry::~IReporterRegistry() = default;
-}
-
-
-
-
-namespace Catch {
-    ITestInvoker::~ITestInvoker() = default;
     ITestCaseRegistry::~ITestCaseRegistry() = default;
 }
 
@@ -2254,7 +2517,9 @@ namespace Catch {
             ResultDisposition::Flags resultDisposition )
     :   m_assertionInfo{ macroName, lineInfo, capturedExpression, resultDisposition },
         m_resultCapture( getResultCapture() )
-    {}
+    {
+        m_resultCapture.notifyAssertionStarted( m_assertionInfo );
+    }
 
     void AssertionHandler::handleExpr( ITransientExpression const& expr ) {
         m_resultCapture.handleExpr( m_assertionInfo, expr, m_reaction );
@@ -2268,7 +2533,7 @@ namespace Catch {
     }
 
     void AssertionHandler::complete() {
-        setCompleted();
+        m_completed = true;
         if( m_reaction.shouldDebugBreak ) {
 
             // If you find your debugger stopping you here then go one level up on the
@@ -2281,16 +2546,9 @@ namespace Catch {
             throw_test_failure_exception();
         }
         if ( m_reaction.shouldSkip ) {
-#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
-            throw Catch::TestSkipException();
-#else
-            CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
-#endif
+            throw_test_skip_exception();
         }
     }
-    void AssertionHandler::setCompleted() {
-        m_completed = true;
-    }
 
     void AssertionHandler::handleUnexpectedInflightException() {
         m_resultCapture.handleUnexpectedInflightException( m_assertionInfo, Catch::translateActiveException(), m_reaction );
@@ -2362,13 +2620,29 @@ namespace {
             ;
     }
 
-    std::string normaliseOpt( std::string const& optName ) {
-#ifdef CATCH_PLATFORM_WINDOWS
-        if ( optName[0] == '/' )
-            return "-" + optName.substr( 1 );
-        else
+    Catch::StringRef normaliseOpt( Catch::StringRef optName ) {
+        if ( optName[0] == '-'
+#if defined(CATCH_PLATFORM_WINDOWS)
+             || optName[0] == '/'
 #endif
-            return optName;
+        ) {
+            return optName.substr( 1, optName.size() );
+        }
+
+        return optName;
+    }
+
+    static size_t find_first_separator(Catch::StringRef sr) {
+        auto is_separator = []( char c ) {
+            return c == ' ' || c == ':' || c == '=';
+        };
+        size_t pos = 0;
+        while (pos < sr.size()) {
+            if (is_separator(sr[pos])) { return pos; }
+            ++pos;
+        }
+
+        return Catch::StringRef::npos;
     }
 
 } // namespace
@@ -2386,23 +2660,23 @@ namespace Catch {
                 }
 
                 if ( it != itEnd ) {
-                    auto const& next = *it;
+                    StringRef next = *it;
                     if ( isOptPrefix( next[0] ) ) {
-                        auto delimiterPos = next.find_first_of( " :=" );
-                        if ( delimiterPos != std::string::npos ) {
+                        auto delimiterPos = find_first_separator(next);
+                        if ( delimiterPos != StringRef::npos ) {
                             m_tokenBuffer.push_back(
                                 { TokenType::Option,
                                   next.substr( 0, delimiterPos ) } );
                             m_tokenBuffer.push_back(
                                 { TokenType::Argument,
-                                  next.substr( delimiterPos + 1 ) } );
+                                  next.substr( delimiterPos + 1, next.size() ) } );
                         } else {
                             if ( next[1] != '-' && next.size() > 2 ) {
-                                std::string opt = "- ";
+                                // Combined short args, e.g. "-ab" for "-a -b"
                                 for ( size_t i = 1; i < next.size(); ++i ) {
-                                    opt[1] = next[i];
                                     m_tokenBuffer.push_back(
-                                        { TokenType::Option, opt } );
+                                        { TokenType::Option,
+                                          next.substr( i, 1 ) } );
                                 }
                             } else {
                                 m_tokenBuffer.push_back(
@@ -2462,12 +2736,12 @@ namespace Catch {
             size_t ParserBase::cardinality() const { return 1; }
 
             InternalParseResult ParserBase::parse( Args const& args ) const {
-                return parse( args.exeName(), TokenStream( args ) );
+                return parse( static_cast<std::string>(args.exeName()), TokenStream( args ) );
             }
 
             ParseState::ParseState( ParseResultType type,
-                                    TokenStream const& remainingTokens ):
-                m_type( type ), m_remainingTokens( remainingTokens ) {}
+                                    TokenStream remainingTokens ):
+                m_type( type ), m_remainingTokens( CATCH_MOVE(remainingTokens) ) {}
 
             ParserResult BoundFlagRef::setFlag( bool flag ) {
                 m_ref = flag;
@@ -2485,34 +2759,34 @@ namespace Catch {
 } // namespace Detail
 
         Detail::InternalParseResult Arg::parse(std::string const&,
-                                               Detail::TokenStream const& tokens) const {
+                                               Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            auto const& token = *remainingTokens;
+            auto token = *tokens;
             if (token.type != Detail::TokenType::Argument)
                 return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::NoMatch, remainingTokens));
+                    ParseResultType::NoMatch, CATCH_MOVE(tokens)));
 
             assert(!m_ref->isFlag());
             auto valueRef =
                 static_cast<Detail::BoundValueRefBase*>(m_ref.get());
 
-            auto result = valueRef->setValue(remainingTokens->token);
-            if (!result)
-                return Detail::InternalParseResult(result);
+            auto result = valueRef->setValue(static_cast<std::string>(token.token));
+            if ( !result )
+                return Detail::InternalParseResult( result );
             else
-                return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::Matched, ++remainingTokens));
+                return Detail::InternalParseResult::ok(
+                    Detail::ParseState( ParseResultType::Matched,
+                                        CATCH_MOVE( ++tokens ) ) );
         }
 
         Opt::Opt(bool& ref) :
             ParserRefImpl(std::make_shared<Detail::BoundFlagRef>(ref)) {}
 
-        std::vector<Detail::HelpColumns> Opt::getHelpColumns() const {
-            std::ostringstream oss;
+        Detail::HelpColumns Opt::getHelpColumns() const {
+            ReusableStringStream oss;
             bool first = true;
             for (auto const& opt : m_optNames) {
                 if (first)
@@ -2523,10 +2797,10 @@ namespace Catch {
             }
             if (!m_hint.empty())
                 oss << " <" << m_hint << '>';
-            return { { oss.str(), m_description } };
+            return { oss.str(), m_description };
         }
 
-        bool Opt::isMatch(std::string const& optToken) const {
+        bool Opt::isMatch(StringRef optToken) const {
             auto normalisedToken = normaliseOpt(optToken);
             for (auto const& name : m_optNames) {
                 if (normaliseOpt(name) == normalisedToken)
@@ -2536,15 +2810,14 @@ namespace Catch {
         }
 
         Detail::InternalParseResult Opt::parse(std::string const&,
-                                       Detail::TokenStream const& tokens) const {
+                                       Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            if (remainingTokens &&
-                remainingTokens->type == Detail::TokenType::Option) {
-                auto const& token = *remainingTokens;
+            if (tokens &&
+                tokens->type == Detail::TokenType::Option) {
+                auto const& token = *tokens;
                 if (isMatch(token.token)) {
                     if (m_ref->isFlag()) {
                         auto flagRef =
@@ -2556,35 +2829,35 @@ namespace Catch {
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     } else {
                         auto valueRef =
                             static_cast<Detail::BoundValueRefBase*>(
                                 m_ref.get());
-                        ++remainingTokens;
-                        if (!remainingTokens)
+                        ++tokens;
+                        if (!tokens)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        auto const& argToken = *remainingTokens;
+                        auto const& argToken = *tokens;
                         if (argToken.type != Detail::TokenType::Argument)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        const auto result = valueRef->setValue(argToken.token);
+                        const auto result = valueRef->setValue(static_cast<std::string>(argToken.token));
                         if (!result)
                             return Detail::InternalParseResult(result);
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     }
                     return Detail::InternalParseResult::ok(Detail::ParseState(
-                        ParseResultType::Matched, ++remainingTokens));
+                        ParseResultType::Matched, CATCH_MOVE(++tokens)));
                 }
             }
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, remainingTokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         Detail::Result Opt::validate() const {
@@ -2616,9 +2889,9 @@ namespace Catch {
 
         Detail::InternalParseResult
             ExeName::parse(std::string const&,
-                           Detail::TokenStream const& tokens) const {
+                           Detail::TokenStream tokens) const {
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, tokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         ParserResult ExeName::set(std::string const& newName) {
@@ -2648,9 +2921,9 @@ namespace Catch {
 
         std::vector<Detail::HelpColumns> Parser::getHelpColumns() const {
             std::vector<Detail::HelpColumns> cols;
+            cols.reserve( m_options.size() );
             for ( auto const& o : m_options ) {
-                auto childCols = o.getHelpColumns();
-                cols.insert( cols.end(), childCols.begin(), childCols.end() );
+                cols.push_back(o.getHelpColumns());
             }
             return cols;
         }
@@ -2688,12 +2961,12 @@ namespace Catch {
 
             optWidth = ( std::min )( optWidth, consoleWidth / 2 );
 
-            for ( auto const& cols : rows ) {
-                auto row = TextFlow::Column( cols.left )
+            for ( auto& cols : rows ) {
+                auto row = TextFlow::Column( CATCH_MOVE(cols.left) )
                                .width( optWidth )
                                .indent( 2 ) +
                            TextFlow::Spacer( 4 ) +
-                           TextFlow::Column( cols.right )
+                           TextFlow::Column( static_cast<std::string>(cols.descriptions) )
                                .width( consoleWidth - 7 - optWidth );
                 os << row << '\n';
             }
@@ -2715,7 +2988,7 @@ namespace Catch {
 
         Detail::InternalParseResult
         Parser::parse( std::string const& exeName,
-                       Detail::TokenStream const& tokens ) const {
+                       Detail::TokenStream tokens ) const {
 
             struct ParserInfo {
                 ParserBase const* parser = nullptr;
@@ -2733,7 +3006,7 @@ namespace Catch {
             m_exeName.set( exeName );
 
             auto result = Detail::InternalParseResult::ok(
-                Detail::ParseState( ParseResultType::NoMatch, tokens ) );
+                Detail::ParseState( ParseResultType::NoMatch, CATCH_MOVE(tokens) ) );
             while ( result.value().remainingTokens() ) {
                 bool tokenParsed = false;
 
@@ -2741,7 +3014,7 @@ namespace Catch {
                     if ( parseInfo.parser->cardinality() == 0 ||
                          parseInfo.count < parseInfo.parser->cardinality() ) {
                         result = parseInfo.parser->parse(
-                            exeName, result.value().remainingTokens() );
+                            exeName, CATCH_MOVE(result).value().remainingTokens() );
                         if ( !result )
                             return result;
                         if ( result.value().type() !=
@@ -2767,7 +3040,7 @@ namespace Catch {
         Args::Args(int argc, char const* const* argv) :
             m_exeName(argv[0]), m_args(argv + 1, argv + argc) {}
 
-        Args::Args(std::initializer_list<std::string> args) :
+        Args::Args(std::initializer_list<StringRef> args) :
             m_exeName(*args.begin()),
             m_args(args.begin() + 1, args.end()) {}
 
@@ -2917,7 +3190,7 @@ namespace Catch {
 
             auto const& reporterSpec = *parsed;
 
-            IReporterRegistry::FactoryMap const& factories =
+            auto const& factories =
                 getRegistryHub().getReporterRegistry().getFactories();
             auto result = factories.find( reporterSpec.name() );
 
@@ -3073,8 +3346,8 @@ namespace Catch {
                 ( "split the tests to execute into this many groups" )
             | Opt( setShardIndex, "shard index" )
                 ["--shard-index"]
-                ( "index of the group of tests to execute (see --shard-count)" ) |
-            Opt( config.allowZeroTests )
+                ( "index of the group of tests to execute (see --shard-count)" )
+            | Opt( config.allowZeroTests )
                 ["--allow-running-no-tests"]
                 ( "Treat 'No tests run' as a success" )
             | Arg( config.testsOrTags, "test name|pattern|tags" )
@@ -3155,7 +3428,7 @@ namespace Catch {
     namespace {
         //! A do-nothing implementation of colour, used as fallback for unknown
         //! platforms, and when the user asks to deactivate all colours.
-        class NoColourImpl : public ColourImpl {
+        class NoColourImpl final : public ColourImpl {
         public:
             NoColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
@@ -3173,7 +3446,7 @@ namespace Catch {
 namespace Catch {
 namespace {
 
-    class Win32ColourImpl : public ColourImpl {
+    class Win32ColourImpl final : public ColourImpl {
     public:
         Win32ColourImpl(IStream* stream):
             ColourImpl(stream) {
@@ -3239,7 +3512,7 @@ namespace {
 namespace Catch {
 namespace {
 
-    class ANSIColourImpl : public ColourImpl {
+    class ANSIColourImpl final : public ColourImpl {
     public:
         ANSIColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
@@ -3355,49 +3628,27 @@ namespace Catch {
 
 namespace Catch {
 
-    class Context : public IMutableContext, private Detail::NonCopyable {
-
-    public: // IContext
-        IResultCapture* getResultCapture() override {
-            return m_resultCapture;
-        }
-
-        IConfig const* getConfig() const override {
-            return m_config;
-        }
-
-        ~Context() override;
-
-    public: // IMutableContext
-        void setResultCapture( IResultCapture* resultCapture ) override {
-            m_resultCapture = resultCapture;
-        }
-        void setConfig( IConfig const* config ) override {
-            m_config = config;
-        }
+    Context* Context::currentContext = nullptr;
 
-        friend IMutableContext& getCurrentMutableContext();
-
-    private:
-        IConfig const* m_config = nullptr;
-        IResultCapture* m_resultCapture = nullptr;
-    };
-
-    IMutableContext *IMutableContext::currentContext = nullptr;
-
-    void IMutableContext::createContext()
-    {
+    void cleanUpContext() {
+        delete Context::currentContext;
+        Context::currentContext = nullptr;
+    }
+    void Context::createContext() {
         currentContext = new Context();
     }
 
-    void cleanUpContext() {
-        delete IMutableContext::currentContext;
-        IMutableContext::currentContext = nullptr;
+    Context& getCurrentMutableContext() {
+        if ( !Context::currentContext ) { Context::createContext(); }
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return *Context::currentContext;
     }
-    IContext::~IContext() = default;
-    IMutableContext::~IMutableContext() = default;
-    Context::~Context() = default;
 
+    void Context::setResultCapture( IResultCapture* resultCapture ) {
+        m_resultCapture = resultCapture;
+    }
+
+    void Context::setConfig( IConfig const* config ) { m_config = config; }
 
     SimplePcg32& sharedRng() {
         static SimplePcg32 s_rng;
@@ -3635,7 +3886,7 @@ namespace Catch {
             return parsed;
         }
 
-        EnumInfo::~EnumInfo() {}
+        EnumInfo::~EnumInfo() = default;
 
         StringRef EnumInfo::lookup( int value ) const {
             for( auto const& valueToName : m_values ) {
@@ -3680,10 +3931,27 @@ namespace Catch {
 
 
 
+#include <exception>
+
 namespace Catch {
 
-    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    namespace {
+        static std::string tryTranslators(
+            std::vector<
+                Detail::unique_ptr<IExceptionTranslator const>> const& translators ) {
+            if ( translators.empty() ) {
+                std::rethrow_exception( std::current_exception() );
+            } else {
+                return translators[0]->translate( translators.begin() + 1,
+                                                  translators.end() );
+            }
+        }
+
     }
+#endif //!defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+
+    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() = default;
 
     void ExceptionTranslatorRegistry::registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) {
         m_translators.push_back( CATCH_MOVE( translator ) );
@@ -3706,7 +3974,7 @@ namespace Catch {
         // First we try user-registered translators. If none of them can
         // handle the exception, it will be rethrown handled by our defaults.
         try {
-            return tryTranslators();
+            return tryTranslators(m_translators);
         }
         // To avoid having to handle TFE explicitly everywhere, we just
         // rethrow it so that it goes back up the caller.
@@ -3730,25 +3998,12 @@ namespace Catch {
         }
     }
 
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        if (m_translators.empty()) {
-            std::rethrow_exception(std::current_exception());
-        } else {
-            return m_translators[0]->translate(m_translators.begin() + 1, m_translators.end());
-        }
-    }
-
 #else // ^^ Exceptions are enabled // Exceptions are disabled vv
     std::string ExceptionTranslatorRegistry::translateActiveException() const {
         CATCH_INTERNAL_ERROR("Attempted to translate active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
     }
-
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        CATCH_INTERNAL_ERROR("Attempted to use exception translators under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
-    }
 #endif
 
-
 }
 
 
@@ -4005,6 +4260,17 @@ namespace Catch {
             return i;
         }
 
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        bool directCompare( float lhs, float rhs ) { return lhs == rhs; }
+        bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+
     } // end namespace Detail
 } // end namespace Catch
 
@@ -4053,7 +4319,7 @@ namespace Catch {
 namespace Detail {
     namespace {
         template<typename WriterF, std::size_t bufferSize=256>
-        class StreamBufImpl : public std::streambuf {
+        class StreamBufImpl final : public std::streambuf {
             char data[bufferSize];
             WriterF m_writer;
 
@@ -4101,7 +4367,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class FileStream : public IStream {
+        class FileStream final : public IStream {
             std::ofstream m_ofs;
         public:
             FileStream( std::string const& filename ) {
@@ -4109,7 +4375,6 @@ namespace Detail {
                 CATCH_ENFORCE( !m_ofs.fail(), "Unable to open file: '" << filename << '\'' );
                 m_ofs << std::unitbuf;
             }
-            ~FileStream() override = default;
         public: // IStream
             std::ostream& stream() override {
                 return m_ofs;
@@ -4118,13 +4383,12 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class CoutStream : public IStream {
+        class CoutStream final : public IStream {
             std::ostream m_os;
         public:
             // Store the streambuf from cout up-front because
             // cout may get redirected when running tests
             CoutStream() : m_os( Catch::cout().rdbuf() ) {}
-            ~CoutStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -4138,7 +4402,6 @@ namespace Detail {
             // Store the streambuf from cerr up-front because
             // cout may get redirected when running tests
             CerrStream(): m_os( Catch::cerr().rdbuf() ) {}
-            ~CerrStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -4147,7 +4410,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class DebugOutStream : public IStream {
+        class DebugOutStream final : public IStream {
             Detail::unique_ptr<StreamBufImpl<OutputDebugWriter>> m_streamBuf;
             std::ostream m_os;
         public:
@@ -4156,8 +4419,6 @@ namespace Detail {
                 m_os( m_streamBuf.get() )
             {}
 
-            ~DebugOutStream() override = default;
-
         public: // IStream
             std::ostream& stream() override { return m_os; }
         };
@@ -4189,6 +4450,147 @@ namespace Detail {
 
 
 
+namespace Catch {
+    void JsonUtils::indent( std::ostream& os, std::uint64_t level ) {
+        for ( std::uint64_t i = 0; i < level; ++i ) {
+            os << "  ";
+        }
+    }
+    void JsonUtils::appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level ) {
+        if ( should_comma ) { os << ','; }
+        should_comma = true;
+        os << '\n';
+        indent( os, level );
+    }
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os ):
+        JsonObjectWriter{ os, 0 } {}
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os,
+                                        std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '{';
+    }
+    JsonObjectWriter::JsonObjectWriter( JsonObjectWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+
+    JsonObjectWriter::~JsonObjectWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << '}';
+    }
+
+    JsonValueWriter JsonObjectWriter::write( StringRef key ) {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+
+        m_os << '"' << key << "\": ";
+        return JsonValueWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os ):
+        JsonArrayWriter{ os, 0 } {}
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '[';
+    }
+    JsonArrayWriter::JsonArrayWriter( JsonArrayWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+    JsonArrayWriter::~JsonArrayWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << ']';
+    }
+
+    JsonObjectWriter JsonArrayWriter::writeObject() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonObjectWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter JsonArrayWriter::writeArray() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonArrayWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter& JsonArrayWriter::write( bool value ) {
+        return writeImpl( value );
+    }
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os ):
+        JsonValueWriter{ os, 0 } {}
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {}
+
+    JsonObjectWriter JsonValueWriter::writeObject() && {
+        return JsonObjectWriter{ m_os, m_indent_level };
+    }
+
+    JsonArrayWriter JsonValueWriter::writeArray() && {
+        return JsonArrayWriter{ m_os, m_indent_level };
+    }
+
+    void JsonValueWriter::write( Catch::StringRef value ) && {
+        writeImpl( value, true );
+    }
+
+    void JsonValueWriter::write( bool value ) && {
+        writeImpl( value ? "true"_sr : "false"_sr, false );
+    }
+
+    void JsonValueWriter::writeImpl( Catch::StringRef value, bool quote ) {
+        if ( quote ) { m_os << '"'; }
+        for (char c : value) {
+            // Escape list taken from https://www.json.org/json-en.html,
+            // string definition.
+            // Note that while forward slash _can_ be escaped, it does
+            // not have to be, if JSON is not further embedded somewhere
+            // where forward slash is meaningful.
+            if ( c == '"' ) {
+                m_os << "\\\"";
+            } else if ( c == '\\' ) {
+                m_os << "\\\\";
+            } else if ( c == '\b' ) {
+                m_os << "\\b";
+            } else if ( c == '\f' ) {
+                m_os << "\\f";
+            } else if ( c == '\n' ) {
+                m_os << "\\n";
+            } else if ( c == '\r' ) {
+                m_os << "\\r";
+            } else if ( c == '\t' ) {
+                m_os << "\\t";
+            } else {
+                m_os << c;
+            }
+        }
+        if ( quote ) { m_os << '"'; }
+    }
+
+} // namespace Catch
+
+
+
 
 namespace Catch {
 
@@ -4231,7 +4633,7 @@ namespace Catch {
 
 #else // ^^ Windows crt debug heap enabled // Windows crt debug heap disabled vv
 
-    Catch::LeakDetector::LeakDetector() {}
+    Catch::LeakDetector::LeakDetector() = default;
 
 #endif // CATCH_CONFIG_WINDOWS_CRTDBG
 
@@ -4242,7 +4644,6 @@ Catch::LeakDetector::~LeakDetector() {
 
 
 
-
 namespace Catch {
     namespace {
 
@@ -4277,7 +4678,7 @@ namespace Catch {
         void listReporters(IEventListener& reporter) {
             std::vector<ReporterDescription> descriptions;
 
-            IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+            auto const& factories = getRegistryHub().getReporterRegistry().getFactories();
             descriptions.reserve(factories.size());
             for (auto const& fac : factories) {
                 descriptions.push_back({ fac.first, fac.second->getDescription() });
@@ -4599,6 +5000,14 @@ namespace Catch {
     }
 #endif
 
+#if !defined( CATCH_CONFIG_GLOBAL_NEXTAFTER )
+    float nextafter( float x, float y ) { return std::nextafter( x, y ); }
+    double nextafter( double x, double y ) { return std::nextafter( x, y ); }
+#else
+    float nextafter( float x, float y ) { return ::nextafterf( x, y ); }
+    double nextafter( double x, double y ) { return ::nextafter( x, y ); }
+#endif
+
 } // end namespace Catch
 
 
@@ -4680,10 +5089,10 @@ namespace Catch {
             return static_cast<std::uint32_t>( std::time( nullptr ) );
 
         case GenerateFrom::Default:
-        case GenerateFrom::RandomDevice:
-            // In theory, a platform could have random_device that returns just
-            // 16 bits. That is still some randomness, so we don't care too much
-            return static_cast<std::uint32_t>( std::random_device{}() );
+        case GenerateFrom::RandomDevice: {
+            std::random_device rd;
+            return Detail::fillBitsFrom<std::uint32_t>( rd );
+        }
 
         default:
             CATCH_ERROR("Unknown generation method");
@@ -4696,49 +5105,73 @@ namespace Catch {
 
 
 namespace Catch {
+    struct ReporterRegistry::ReporterRegistryImpl {
+        std::vector<Detail::unique_ptr<EventListenerFactory>> listeners;
+        std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>
+            factories;
+    };
 
-    ReporterRegistry::ReporterRegistry() {
+    ReporterRegistry::ReporterRegistry():
+        m_impl( Detail::make_unique<ReporterRegistryImpl>() ) {
         // Because it is impossible to move out of initializer list,
         // we have to add the elements manually
-        m_factories["Automake"] = Detail::make_unique<ReporterFactory<AutomakeReporter>>();
-        m_factories["compact"] = Detail::make_unique<ReporterFactory<CompactReporter>>();
-        m_factories["console"] = Detail::make_unique<ReporterFactory<ConsoleReporter>>();
-        m_factories["JUnit"] = Detail::make_unique<ReporterFactory<JunitReporter>>();
-        m_factories["SonarQube"] = Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
-        m_factories["TAP"] = Detail::make_unique<ReporterFactory<TAPReporter>>();
-        m_factories["TeamCity"] = Detail::make_unique<ReporterFactory<TeamCityReporter>>();
-        m_factories["XML"] = Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["Automake"] =
+            Detail::make_unique<ReporterFactory<AutomakeReporter>>();
+        m_impl->factories["compact"] =
+            Detail::make_unique<ReporterFactory<CompactReporter>>();
+        m_impl->factories["console"] =
+            Detail::make_unique<ReporterFactory<ConsoleReporter>>();
+        m_impl->factories["JUnit"] =
+            Detail::make_unique<ReporterFactory<JunitReporter>>();
+        m_impl->factories["SonarQube"] =
+            Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
+        m_impl->factories["TAP"] =
+            Detail::make_unique<ReporterFactory<TAPReporter>>();
+        m_impl->factories["TeamCity"] =
+            Detail::make_unique<ReporterFactory<TeamCityReporter>>();
+        m_impl->factories["XML"] =
+            Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["JSON"] =
+            Detail::make_unique<ReporterFactory<JsonReporter>>();
     }
 
     ReporterRegistry::~ReporterRegistry() = default;
 
-
-    IEventListenerPtr ReporterRegistry::create( std::string const& name, ReporterConfig&& config ) const {
-        auto it =  m_factories.find( name );
-        if( it == m_factories.end() )
-            return nullptr;
-        return it->second->create( CATCH_MOVE(config) );
+    IEventListenerPtr
+    ReporterRegistry::create( std::string const& name,
+                              ReporterConfig&& config ) const {
+        auto it = m_impl->factories.find( name );
+        if ( it == m_impl->factories.end() ) return nullptr;
+        return it->second->create( CATCH_MOVE( config ) );
     }
 
-    void ReporterRegistry::registerReporter( std::string const& name, IReporterFactoryPtr factory ) {
+    void ReporterRegistry::registerReporter( std::string const& name,
+                                             IReporterFactoryPtr factory ) {
         CATCH_ENFORCE( name.find( "::" ) == name.npos,
-                       "'::' is not allowed in reporter name: '" + name + '\'' );
-        auto ret = m_factories.emplace(name, CATCH_MOVE(factory));
-        CATCH_ENFORCE( ret.second, "reporter using '" + name + "' as name was already registered" );
+                       "'::' is not allowed in reporter name: '" + name +
+                           '\'' );
+        auto ret = m_impl->factories.emplace( name, CATCH_MOVE( factory ) );
+        CATCH_ENFORCE( ret.second,
+                       "reporter using '" + name +
+                           "' as name was already registered" );
     }
     void ReporterRegistry::registerListener(
         Detail::unique_ptr<EventListenerFactory> factory ) {
-        m_listeners.push_back( CATCH_MOVE(factory) );
+        m_impl->listeners.push_back( CATCH_MOVE( factory ) );
     }
 
-    IReporterRegistry::FactoryMap const& ReporterRegistry::getFactories() const {
-        return m_factories;
-    }
-    IReporterRegistry::Listeners const& ReporterRegistry::getListeners() const {
-        return m_listeners;
+    std::map<std::string,
+             IReporterFactoryPtr,
+             Detail::CaseInsensitiveLess> const&
+    ReporterRegistry::getFactories() const {
+        return m_impl->factories;
     }
 
-}
+    std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+    ReporterRegistry::getListeners() const {
+        return m_impl->listeners;
+    }
+} // namespace Catch
 
 
 
@@ -4754,9 +5187,9 @@ namespace Catch {
         };
 
         kvPair splitKVPair(StringRef kvString) {
-            auto splitPos = static_cast<size_t>( std::distance(
-                kvString.begin(),
-                std::find( kvString.begin(), kvString.end(), '=' ) ) );
+            auto splitPos = static_cast<size_t>(
+                std::find( kvString.begin(), kvString.end(), '=' ) -
+                kvString.begin() );
 
             return { kvString.substr( 0, splitPos ),
                      kvString.substr( splitPos + 1, kvString.size() ) };
@@ -4988,146 +5421,151 @@ namespace Catch {
 namespace Catch {
 
     namespace Generators {
-        struct GeneratorTracker : TestCaseTracking::TrackerBase, IGeneratorTracker {
-            GeneratorBasePtr m_generator;
+        namespace {
+            struct GeneratorTracker final : TestCaseTracking::TrackerBase,
+                                      IGeneratorTracker {
+                GeneratorBasePtr m_generator;
+
+                GeneratorTracker(
+                    TestCaseTracking::NameAndLocation&& nameAndLocation,
+                    TrackerContext& ctx,
+                    ITracker* parent ):
+                    TrackerBase( CATCH_MOVE( nameAndLocation ), ctx, parent ) {}
+
+                static GeneratorTracker*
+                acquire( TrackerContext& ctx,
+                         TestCaseTracking::NameAndLocationRef const&
+                             nameAndLocation ) {
+                    GeneratorTracker* tracker;
+
+                    ITracker& currentTracker = ctx.currentTracker();
+                    // Under specific circumstances, the generator we want
+                    // to acquire is also the current tracker. If this is
+                    // the case, we have to avoid looking through current
+                    // tracker's children, and instead return the current
+                    // tracker.
+                    // A case where this check is important is e.g.
+                    //     for (int i = 0; i < 5; ++i) {
+                    //         int n = GENERATE(1, 2);
+                    //     }
+                    //
+                    // without it, the code above creates 5 nested generators.
+                    if ( currentTracker.nameAndLocation() == nameAndLocation ) {
+                        auto thisTracker = currentTracker.parent()->findChild(
+                            nameAndLocation );
+                        assert( thisTracker );
+                        assert( thisTracker->isGeneratorTracker() );
+                        tracker = static_cast<GeneratorTracker*>( thisTracker );
+                    } else if ( ITracker* childTracker =
+                                    currentTracker.findChild(
+                                        nameAndLocation ) ) {
+                        assert( childTracker );
+                        assert( childTracker->isGeneratorTracker() );
+                        tracker =
+                            static_cast<GeneratorTracker*>( childTracker );
+                    } else {
+                        return nullptr;
+                    }
 
-            GeneratorTracker( TestCaseTracking::NameAndLocation&& nameAndLocation, TrackerContext& ctx, ITracker* parent )
-            :   TrackerBase( CATCH_MOVE(nameAndLocation), ctx, parent )
-            {}
-            ~GeneratorTracker() override;
-
-            static GeneratorTracker* acquire( TrackerContext& ctx, TestCaseTracking::NameAndLocationRef const& nameAndLocation ) {
-                GeneratorTracker* tracker;
-
-                ITracker& currentTracker = ctx.currentTracker();
-                // Under specific circumstances, the generator we want
-                // to acquire is also the current tracker. If this is
-                // the case, we have to avoid looking through current
-                // tracker's children, and instead return the current
-                // tracker.
-                // A case where this check is important is e.g.
-                //     for (int i = 0; i < 5; ++i) {
-                //         int n = GENERATE(1, 2);
-                //     }
-                //
-                // without it, the code above creates 5 nested generators.
-                if ( currentTracker.nameAndLocation() == nameAndLocation ) {
-                    auto thisTracker =
-                        currentTracker.parent()->findChild( nameAndLocation );
-                    assert( thisTracker );
-                    assert( thisTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( thisTracker );
-                } else if ( ITracker* childTracker =
-                                currentTracker.findChild( nameAndLocation ) ) {
-                    assert( childTracker );
-                    assert( childTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( childTracker );
-                } else {
-                    return nullptr;
-                }
+                    if ( !tracker->isComplete() ) { tracker->open(); }
 
-                if( !tracker->isComplete() ) {
-                    tracker->open();
+                    return tracker;
                 }
 
-                return tracker;
-            }
-
-            // TrackerBase interface
-            bool isGeneratorTracker() const override { return true; }
-            auto hasGenerator() const -> bool override {
-                return !!m_generator;
-            }
-            void close() override {
-                TrackerBase::close();
-                // If a generator has a child (it is followed by a section)
-                // and none of its children have started, then we must wait
-                // until later to start consuming its values.
-                // This catches cases where `GENERATE` is placed between two
-                // `SECTION`s.
-                // **The check for m_children.empty cannot be removed**.
-                // doing so would break `GENERATE` _not_ followed by `SECTION`s.
-                const bool should_wait_for_child = [&]() {
-                    // No children -> nobody to wait for
-                    if ( m_children.empty() ) {
-                        return false;
-                    }
-                    // If at least one child started executing, don't wait
-                    if ( std::find_if(
-                             m_children.begin(),
-                             m_children.end(),
-                             []( TestCaseTracking::ITrackerPtr const& tracker ) {
-                                 return tracker->hasStarted();
-                             } ) != m_children.end() ) {
-                        return false;
-                    }
-
-                    // No children have started. We need to check if they _can_
-                    // start, and thus we should wait for them, or they cannot
-                    // start (due to filters), and we shouldn't wait for them
-                    ITracker* parent = m_parent;
-                    // This is safe: there is always at least one section
-                    // tracker in a test case tracking tree
-                    while ( !parent->isSectionTracker() ) {
-                        parent = parent->parent();
-                    }
-                    assert( parent &&
-                            "Missing root (test case) level section" );
-
-                    auto const& parentSection =
-                        static_cast<SectionTracker const&>( *parent );
-                    auto const& filters = parentSection.getFilters();
-                    // No filters -> no restrictions on running sections
-                    if ( filters.empty() ) {
-                        return true;
-                    }
+                // TrackerBase interface
+                bool isGeneratorTracker() const override { return true; }
+                auto hasGenerator() const -> bool override {
+                    return !!m_generator;
+                }
+                void close() override {
+                    TrackerBase::close();
+                    // If a generator has a child (it is followed by a section)
+                    // and none of its children have started, then we must wait
+                    // until later to start consuming its values.
+                    // This catches cases where `GENERATE` is placed between two
+                    // `SECTION`s.
+                    // **The check for m_children.empty cannot be removed**.
+                    // doing so would break `GENERATE` _not_ followed by
+                    // `SECTION`s.
+                    const bool should_wait_for_child = [&]() {
+                        // No children -> nobody to wait for
+                        if ( m_children.empty() ) { return false; }
+                        // If at least one child started executing, don't wait
+                        if ( std::find_if(
+                                 m_children.begin(),
+                                 m_children.end(),
+                                 []( TestCaseTracking::ITrackerPtr const&
+                                         tracker ) {
+                                     return tracker->hasStarted();
+                                 } ) != m_children.end() ) {
+                            return false;
+                        }
 
-                    for ( auto const& child : m_children ) {
-                        if ( child->isSectionTracker() &&
-                             std::find(
-                                 filters.begin(),
-                                 filters.end(),
-                                 static_cast<SectionTracker const&>( *child )
-                                     .trimmedName() ) != filters.end() ) {
-                            return true;
+                        // No children have started. We need to check if they
+                        // _can_ start, and thus we should wait for them, or
+                        // they cannot start (due to filters), and we shouldn't
+                        // wait for them
+                        ITracker* parent = m_parent;
+                        // This is safe: there is always at least one section
+                        // tracker in a test case tracking tree
+                        while ( !parent->isSectionTracker() ) {
+                            parent = parent->parent();
                         }
+                        assert( parent &&
+                                "Missing root (test case) level section" );
+
+                        auto const& parentSection =
+                            static_cast<SectionTracker const&>( *parent );
+                        auto const& filters = parentSection.getFilters();
+                        // No filters -> no restrictions on running sections
+                        if ( filters.empty() ) { return true; }
+
+                        for ( auto const& child : m_children ) {
+                            if ( child->isSectionTracker() &&
+                                 std::find( filters.begin(),
+                                            filters.end(),
+                                            static_cast<SectionTracker const&>(
+                                                *child )
+                                                .trimmedName() ) !=
+                                     filters.end() ) {
+                                return true;
+                            }
+                        }
+                        return false;
+                    }();
+
+                    // This check is a bit tricky, because m_generator->next()
+                    // has a side-effect, where it consumes generator's current
+                    // value, but we do not want to invoke the side-effect if
+                    // this generator is still waiting for any child to start.
+                    assert( m_generator && "Tracker without generator" );
+                    if ( should_wait_for_child ||
+                         ( m_runState == CompletedSuccessfully &&
+                           m_generator->countedNext() ) ) {
+                        m_children.clear();
+                        m_runState = Executing;
                     }
-                    return false;
-                }();
-
-                // This check is a bit tricky, because m_generator->next()
-                // has a side-effect, where it consumes generator's current
-                // value, but we do not want to invoke the side-effect if
-                // this generator is still waiting for any child to start.
-                assert( m_generator && "Tracker without generator" );
-                if ( should_wait_for_child ||
-                     ( m_runState == CompletedSuccessfully &&
-                       m_generator->countedNext() ) ) {
-                    m_children.clear();
-                    m_runState = Executing;
                 }
-            }
 
-            // IGeneratorTracker interface
-            auto getGenerator() const -> GeneratorBasePtr const& override {
-                return m_generator;
-            }
-            void setGenerator( GeneratorBasePtr&& generator ) override {
-                m_generator = CATCH_MOVE( generator );
-            }
-        };
-        GeneratorTracker::~GeneratorTracker() = default;
+                // IGeneratorTracker interface
+                auto getGenerator() const -> GeneratorBasePtr const& override {
+                    return m_generator;
+                }
+                void setGenerator( GeneratorBasePtr&& generator ) override {
+                    m_generator = CATCH_MOVE( generator );
+                }
+            };
+        } // namespace
     }
 
     RunContext::RunContext(IConfig const* _config, IEventListenerPtr&& reporter)
     :   m_runInfo(_config->name()),
-        m_context(getCurrentMutableContext()),
         m_config(_config),
         m_reporter(CATCH_MOVE(reporter)),
         m_lastAssertionInfo{ StringRef(), SourceLineInfo("",0), StringRef(), ResultDisposition::Normal },
         m_includeSuccessfulResults( m_config->includeSuccessfulResults() || m_reporter->getPreferences().shouldReportAllAssertions )
     {
-        m_context.setResultCapture(this);
+        getCurrentMutableContext().setResultCapture( this );
         m_reporter->testRunStarting(m_runInfo);
     }
 
@@ -5222,7 +5660,7 @@ namespace Catch {
     }
 
 
-    void RunContext::assertionEnded(AssertionResult const & result) {
+    void RunContext::assertionEnded(AssertionResult&& result) {
         if (result.getResultType() == ResultWas::Ok) {
             m_totals.assertions.passed++;
             m_lastAssertionPassed = true;
@@ -5244,19 +5682,27 @@ namespace Catch {
 
         m_reporter->assertionEnded(AssertionStats(result, m_messages, m_totals));
 
-        if (result.getResultType() != ResultWas::Warning)
+        if ( result.getResultType() != ResultWas::Warning ) {
             m_messageScopes.clear();
+        }
 
-        // Reset working state
-        resetAssertionInfo();
-        m_lastResult = result;
+        // Reset working state. assertion info will be reset after
+        // populateReaction is run if it is needed
+        m_lastResult = CATCH_MOVE( result );
     }
     void RunContext::resetAssertionInfo() {
         m_lastAssertionInfo.macroName = StringRef();
         m_lastAssertionInfo.capturedExpression = "{Unknown expression after the reported line}"_sr;
+        m_lastAssertionInfo.resultDisposition = ResultDisposition::Normal;
+    }
+
+    void RunContext::notifyAssertionStarted( AssertionInfo const& info ) {
+        m_reporter->assertionStarting( info );
     }
 
-    bool RunContext::sectionStarted(StringRef sectionName, SourceLineInfo const& sectionLineInfo, Counts & assertions) {
+    bool RunContext::sectionStarted( StringRef sectionName,
+                                     SourceLineInfo const& sectionLineInfo,
+                                     Counts& assertions ) {
         ITracker& sectionTracker =
             SectionTracker::acquire( m_trackerContext,
                                      TestCaseTracking::NameAndLocationRef(
@@ -5394,7 +5840,8 @@ namespace Catch {
         tempResult.message = static_cast<std::string>(message);
         AssertionResult result(m_lastAssertionInfo, CATCH_MOVE(tempResult));
 
-        assertionEnded(result);
+        assertionEnded(CATCH_MOVE(result) );
+        resetAssertionInfo();
 
         handleUnfinishedSections();
 
@@ -5516,8 +5963,6 @@ namespace Catch {
         ITransientExpression const& expr,
         AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         bool negated = isFalseTest( info.resultDisposition );
         bool result = expr.getResult() != negated;
 
@@ -5533,6 +5978,7 @@ namespace Catch {
             reportExpr(info, ResultWas::ExpressionFailed, &expr, negated );
             populateReaction( reaction );
         }
+        resetAssertionInfo();
     }
     void RunContext::reportExpr(
             AssertionInfo const &info,
@@ -5546,7 +5992,7 @@ namespace Catch {
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
         assertionResult.m_resultData.lazyExpression.m_transientExpression = expr;
 
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
     }
 
     void RunContext::handleMessage(
@@ -5555,22 +6001,23 @@ namespace Catch {
             StringRef message,
             AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         m_lastAssertionInfo = info;
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         data.message = static_cast<std::string>(message);
         AssertionResult assertionResult{ m_lastAssertionInfo,
                                          CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
-        if ( !assertionResult.isOk() ) {
+
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) {
             populateReaction( reaction );
         } else if ( resultType == ResultWas::ExplicitSkip ) {
             // TODO: Need to handle this explicitly, as ExplicitSkip is
             // considered "OK"
             reaction.shouldSkip = true;
         }
+        resetAssertionInfo();
     }
     void RunContext::handleUnexpectedExceptionNotThrown(
             AssertionInfo const& info,
@@ -5581,16 +6028,17 @@ namespace Catch {
 
     void RunContext::handleUnexpectedInflightException(
             AssertionInfo const& info,
-            std::string const& message,
+            std::string&& message,
             AssertionReaction& reaction
     ) {
         m_lastAssertionInfo = info;
 
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
-        data.message = message;
+        data.message = CATCH_MOVE(message);
         AssertionResult assertionResult{ info, CATCH_MOVE(data) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
         populateReaction( reaction );
+        resetAssertionInfo();
     }
 
     void RunContext::populateReaction( AssertionReaction& reaction ) {
@@ -5607,7 +6055,8 @@ namespace Catch {
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
         data.message = "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"s;
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        resetAssertionInfo();
     }
     void RunContext::handleNonExpr(
             AssertionInfo const &info,
@@ -5618,10 +6067,11 @@ namespace Catch {
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
 
-        if( !assertionResult.isOk() )
-            populateReaction( reaction );
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) { populateReaction( reaction ); }
+        resetAssertionInfo();
     }
 
 
@@ -5790,7 +6240,6 @@ namespace Catch {
 
 
 
-#include <algorithm>
 #include <ostream>
 #include <cstring>
 #include <cctype>
@@ -5814,9 +6263,9 @@ namespace Catch {
         return s.find( infix ) != std::string::npos;
     }
     void toLowerInPlace( std::string& s ) {
-        std::transform( s.begin(), s.end(), s.begin(), []( char c ) {
-            return toLower( c );
-        } );
+        for ( char& c : s ) {
+            c = toLower( c );
+        }
     }
     std::string toLower( std::string const& s ) {
         std::string lc = s;
@@ -5949,7 +6398,7 @@ namespace Catch {
 
 namespace Catch {
 
-    TagAliasRegistry::~TagAliasRegistry() {}
+    TagAliasRegistry::~TagAliasRegistry() = default;
 
     TagAlias const* TagAliasRegistry::find( std::string const& alias ) const {
         auto it = m_registry.find( alias );
@@ -6030,6 +6479,38 @@ namespace Catch {
 
 namespace Catch {
 
+    namespace {
+        static void enforceNoDuplicateTestCases(
+            std::vector<TestCaseHandle> const& tests ) {
+            auto testInfoCmp = []( TestCaseInfo const* lhs,
+                                   TestCaseInfo const* rhs ) {
+                return *lhs < *rhs;
+            };
+            std::set<TestCaseInfo const*, decltype( testInfoCmp )&> seenTests(
+                testInfoCmp );
+            for ( auto const& test : tests ) {
+                const auto infoPtr = &test.getTestCaseInfo();
+                const auto prev = seenTests.insert( infoPtr );
+                CATCH_ENFORCE( prev.second,
+                               "error: test case \""
+                                   << infoPtr->name << "\", with tags \""
+                                   << infoPtr->tagsAsString()
+                                   << "\" already defined.\n"
+                                   << "\tFirst seen at "
+                                   << ( *prev.first )->lineInfo << "\n"
+                                   << "\tRedefined at " << infoPtr->lineInfo );
+            }
+        }
+
+        static bool matchTest( TestCaseHandle const& testCase,
+                               TestSpec const& testSpec,
+                               IConfig const& config ) {
+            return testSpec.matches( testCase.getTestCaseInfo() ) &&
+                   isThrowSafe( testCase, config );
+        }
+
+    } // end unnamed namespace
+
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases ) {
         switch (config.runOrder()) {
         case TestRunOrder::Declared:
@@ -6047,7 +6528,6 @@ namespace Catch {
             return sorted;
         }
         case TestRunOrder::Randomized: {
-            seedRng(config);
             using TestWithHash = std::pair<TestCaseInfoHasher::hash_t, TestCaseHandle>;
 
             TestCaseInfoHasher h{ config.rngSeed() };
@@ -6086,29 +6566,6 @@ namespace Catch {
         return !testCase.getTestCaseInfo().throws() || config.allowThrows();
     }
 
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config ) {
-        return testSpec.matches( testCase.getTestCaseInfo() ) && isThrowSafe( testCase, config );
-    }
-
-    void
-    enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& tests ) {
-        auto testInfoCmp = []( TestCaseInfo const* lhs,
-                               TestCaseInfo const* rhs ) {
-            return *lhs < *rhs;
-        };
-        std::set<TestCaseInfo const*, decltype(testInfoCmp) &> seenTests(testInfoCmp);
-        for ( auto const& test : tests ) {
-            const auto infoPtr = &test.getTestCaseInfo();
-            const auto prev = seenTests.insert( infoPtr );
-            CATCH_ENFORCE(
-                prev.second,
-                "error: test case \"" << infoPtr->name << "\", with tags \""
-                    << infoPtr->tagsAsString() << "\" already defined.\n"
-                    << "\tFirst seen at " << ( *prev.first )->lineInfo << "\n"
-                    << "\tRedefined at " << infoPtr->lineInfo );
-        }
-    }
-
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config ) {
         std::vector<TestCaseHandle> filtered;
         filtered.reserve( testCases.size() );
@@ -6149,13 +6606,6 @@ namespace Catch {
         return m_sortedFunctions;
     }
 
-
-
-    ///////////////////////////////////////////////////////////////////////////
-    void TestInvokerAsFunction::invoke() const {
-        m_testAsFunction();
-    }
-
 } // end namespace Catch
 
 
@@ -6401,6 +6851,14 @@ namespace Catch {
 #endif
     }
 
+    void throw_test_skip_exception() {
+#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
+        throw Catch::TestSkipException();
+#else
+        CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
+#endif
+    }
+
 } // namespace Catch
 
 
@@ -6409,9 +6867,10 @@ namespace Catch {
 #include <iterator>
 
 namespace Catch {
+    ITestInvoker::~ITestInvoker() = default;
 
     namespace {
-        StringRef extractClassName( StringRef classOrMethodName ) {
+        static StringRef extractClassName( StringRef classOrMethodName ) {
             if ( !startsWith( classOrMethodName, '&' ) ) {
                 return classOrMethodName;
             }
@@ -6438,6 +6897,18 @@ namespace Catch {
                 static_cast<std::size_t>( startIdx ),
                 static_cast<std::size_t>( classNameSize ) );
         }
+
+        class TestInvokerAsFunction final : public ITestInvoker {
+            using TestType = void ( * )();
+            TestType m_testAsFunction;
+
+        public:
+            TestInvokerAsFunction( TestType testAsFunction ) noexcept:
+                m_testAsFunction( testAsFunction ) {}
+
+            void invoke() const override { m_testAsFunction(); }
+        };
+
     } // namespace
 
     Detail::unique_ptr<ITestInvoker> makeTestInvoker( void(*testAsFunction)() ) {
@@ -6919,23 +7390,36 @@ namespace Catch {
             return os;
         }
 
-        Columns Column::operator+( Column const& other ) {
+        Columns operator+(Column const& lhs, Column const& rhs) {
             Columns cols;
-            cols += *this;
-            cols += other;
+            cols += lhs;
+            cols += rhs;
             return cols;
         }
-
-        Columns& Columns::operator+=( Column const& col ) {
-            m_columns.push_back( col );
-            return *this;
+        Columns operator+(Column&& lhs, Column&& rhs) {
+            Columns cols;
+            cols += CATCH_MOVE( lhs );
+            cols += CATCH_MOVE( rhs );
+            return cols;
         }
 
-        Columns Columns::operator+( Column const& col ) {
-            Columns combined = *this;
-            combined += col;
+        Columns& operator+=(Columns& lhs, Column const& rhs) {
+            lhs.m_columns.push_back( rhs );
+            return lhs;
+        }
+        Columns& operator+=(Columns& lhs, Column&& rhs) {
+            lhs.m_columns.push_back( CATCH_MOVE(rhs) );
+            return lhs;
+        }
+        Columns operator+( Columns const& lhs, Column const& rhs ) {
+            auto combined( lhs );
+            combined += rhs;
             return combined;
         }
+        Columns operator+( Columns&& lhs, Column&& rhs ) {
+            lhs += CATCH_MOVE( rhs );
+            return CATCH_MOVE( lhs );
+        }
 
     } // namespace TextFlow
 } // namespace Catch
@@ -7431,26 +7915,11 @@ namespace {
         return ulpDist <= maxUlpDiff;
     }
 
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
-
-    float nextafter(float x, float y) {
-        return ::nextafterf(x, y);
-    }
-
-    double nextafter(double x, double y) {
-        return ::nextafter(x, y);
-    }
-
-#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
 
 template <typename FP>
 FP step(FP start, FP direction, uint64_t steps) {
     for (uint64_t i = 0; i < steps; ++i) {
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
         start = Catch::nextafter(start, direction);
-#else
-        start = std::nextafter(start, direction);
-#endif
     }
     return start;
 }
@@ -7824,7 +8293,7 @@ namespace Catch {
 
 namespace Catch {
 
-    AutomakeReporter::~AutomakeReporter() {}
+    AutomakeReporter::~AutomakeReporter() = default;
 
     void AutomakeReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {
         // Possible values to emit are PASS, XFAIL, SKIP, FAIL, XPASS and ERROR.
@@ -8046,7 +8515,7 @@ class AssertionPrinter {
             return;
 
         const auto itEnd = messages.cend();
-        const auto N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+        const auto N = static_cast<std::size_t>(itEnd - itMessage);
 
         stream << colourImpl->guardColour( colour ) << " with "
                << pluralise( N, "message"_sr ) << ':';
@@ -8124,7 +8593,7 @@ class AssertionPrinter {
             StreamingReporterBase::testRunEnded( _testRunStats );
         }
 
-        CompactReporter::~CompactReporter() {}
+        CompactReporter::~CompactReporter() = default;
 
 } // end namespace Catch
 
@@ -8319,15 +8788,9 @@ findMax( std::size_t& i, std::size_t& j, std::size_t& k, std::size_t& l ) {
         return l;
 }
 
-enum class Justification { Left, Right };
-
-struct ColumnInfo {
-    std::string name;
-    std::size_t width;
-    Justification justification;
-};
 struct ColumnBreak {};
 struct RowBreak {};
+struct OutputFlush {};
 
 class Duration {
     enum class Unit {
@@ -8402,6 +8865,14 @@ class Duration {
 };
 } // end anon namespace
 
+enum class Justification { Left, Right };
+
+struct ColumnInfo {
+    std::string name;
+    std::size_t width;
+    Justification justification;
+};
+
 class TablePrinter {
     std::ostream& m_os;
     std::vector<ColumnInfo> m_columnInfos;
@@ -8424,11 +8895,10 @@ class TablePrinter {
             *this << RowBreak();
 
 			TextFlow::Columns headerCols;
-			auto spacer = TextFlow::Spacer(2);
 			for (auto const& info : m_columnInfos) {
                 assert(info.width > 2);
 				headerCols += TextFlow::Column(info.name).width(info.width - 2);
-				headerCols += spacer;
+                headerCols += TextFlow::Spacer( 2 );
 			}
 			m_os << headerCols << '\n';
 
@@ -8444,12 +8914,12 @@ class TablePrinter {
     }
 
     template<typename T>
-    friend TablePrinter& operator << (TablePrinter& tp, T const& value) {
+    friend TablePrinter& operator<< (TablePrinter& tp, T const& value) {
         tp.m_oss << value;
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, ColumnBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, ColumnBreak) {
         auto colStr = tp.m_oss.str();
         const auto strSize = colStr.size();
         tp.m_oss.str("");
@@ -8471,13 +8941,18 @@ class TablePrinter {
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, RowBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, RowBreak) {
         if (tp.m_currentColumn > 0) {
             tp.m_os << '\n';
             tp.m_currentColumn = -1;
         }
         return tp;
     }
+
+    friend TablePrinter& operator<<(TablePrinter& tp, OutputFlush) {
+        tp.m_os << std::flush;
+        return tp;
+    }
 };
 
 ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
@@ -8499,7 +8974,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
                 { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, Justification::Left },
                 { "samples      mean       std dev", 14, Justification::Right },
                 { "iterations   low mean   low std dev", 14, Justification::Right },
-                { "estimated    high mean  high std dev", 14, Justification::Right }
+                { "est run time high mean  high std dev", 14, Justification::Right }
             };
         }
     }())) {}
@@ -8583,8 +9058,11 @@ void ConsoleReporter::benchmarkPreparing( StringRef name ) {
 void ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {
     (*m_tablePrinter) << info.samples << ColumnBreak()
         << info.iterations << ColumnBreak();
-    if (!m_config->benchmarkNoAnalysis())
-        (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();
+    if ( !m_config->benchmarkNoAnalysis() ) {
+        ( *m_tablePrinter )
+            << Duration( info.estimatedDuration ) << ColumnBreak();
+    }
+    ( *m_tablePrinter ) << OutputFlush{};
 }
 void ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {
     if (m_config->benchmarkNoAnalysis())
@@ -9280,6 +9758,366 @@ namespace Catch {
 } // namespace Catch
 
 
+//
+
+namespace Catch {
+    namespace {
+        void writeSourceInfo( JsonObjectWriter& writer,
+                              SourceLineInfo const& sourceInfo ) {
+            auto source_location_writer =
+                writer.write( "source-location"_sr ).writeObject();
+            source_location_writer.write( "filename"_sr )
+                .write( sourceInfo.file );
+            source_location_writer.write( "line"_sr ).write( sourceInfo.line );
+        }
+
+        void writeTags( JsonArrayWriter writer, std::vector<Tag> const& tags ) {
+            for ( auto const& tag : tags ) {
+                writer.write( tag.original );
+            }
+        }
+
+        void writeProperties( JsonArrayWriter writer,
+                              TestCaseInfo const& info ) {
+            if ( info.isHidden() ) { writer.write( "is-hidden"_sr ); }
+            if ( info.okToFail() ) { writer.write( "ok-to-fail"_sr ); }
+            if ( info.expectedToFail() ) {
+                writer.write( "expected-to-fail"_sr );
+            }
+            if ( info.throws() ) { writer.write( "throws"_sr ); }
+        }
+
+    } // namespace
+
+    JsonReporter::JsonReporter( ReporterConfig&& config ):
+        StreamingReporterBase{ CATCH_MOVE( config ) } {
+
+        m_preferences.shouldRedirectStdOut = true;
+        // TBD: Do we want to report all assertions? XML reporter does
+        //      not, but for machine-parseable reporters I think the answer
+        //      should be yes.
+        m_preferences.shouldReportAllAssertions = true;
+
+        m_objectWriters.emplace( m_stream );
+        m_writers.emplace( Writer::Object );
+        auto& writer = m_objectWriters.top();
+
+        writer.write( "version"_sr ).write( 1 );
+
+        {
+            auto metadata_writer = writer.write( "metadata"_sr ).writeObject();
+            metadata_writer.write( "name"_sr ).write( m_config->name() );
+            metadata_writer.write( "rng-seed"_sr ).write( m_config->rngSeed() );
+            metadata_writer.write( "catch2-version"_sr )
+                .write( libraryVersion() );
+            if ( m_config->testSpec().hasFilters() ) {
+                metadata_writer.write( "filters"_sr )
+                    .write( m_config->testSpec() );
+            }
+        }
+    }
+
+    JsonReporter::~JsonReporter() {
+        endListing();
+        // TODO: Ensure this closes the top level object, add asserts
+        assert( m_writers.size() == 1 && "Only the top level object should be open" );
+        assert( m_writers.top() == Writer::Object );
+        endObject();
+        m_stream << '\n' << std::flush;
+        assert( m_writers.empty() );
+    }
+
+    JsonArrayWriter& JsonReporter::startArray() {
+        m_arrayWriters.emplace( m_arrayWriters.top().writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+    JsonArrayWriter& JsonReporter::startArray( StringRef key ) {
+        m_arrayWriters.emplace(
+            m_objectWriters.top().write( key ).writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+
+    JsonObjectWriter& JsonReporter::startObject() {
+        m_objectWriters.emplace( m_arrayWriters.top().writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+    JsonObjectWriter& JsonReporter::startObject( StringRef key ) {
+        m_objectWriters.emplace(
+            m_objectWriters.top().write( key ).writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+
+    void JsonReporter::endObject() {
+        assert( isInside( Writer::Object ) );
+        m_objectWriters.pop();
+        m_writers.pop();
+    }
+    void JsonReporter::endArray() {
+        assert( isInside( Writer::Array ) );
+        m_arrayWriters.pop();
+        m_writers.pop();
+    }
+
+    bool JsonReporter::isInside( Writer writer ) {
+        return !m_writers.empty() && m_writers.top() == writer;
+    }
+
+    void JsonReporter::startListing() {
+        if ( !m_startedListing ) { startObject( "listings"_sr ); }
+        m_startedListing = true;
+    }
+    void JsonReporter::endListing() {
+        if ( m_startedListing ) { endObject(); }
+        m_startedListing = false;
+    }
+
+    std::string JsonReporter::getDescription() {
+        return "Outputs listings as JSON. Test listing is Work-in-Progress!";
+    }
+
+    void JsonReporter::testRunStarting( TestRunInfo const& testInfo ) {
+        StreamingReporterBase::testRunStarting( testInfo );
+        endListing();
+
+        assert( isInside( Writer::Object ) );
+        startObject( "test-run"_sr );
+        startArray( "test-cases"_sr );
+    }
+
+     static void writeCounts( JsonObjectWriter&& writer, Counts const& counts ) {
+        writer.write( "passed"_sr ).write( counts.passed );
+        writer.write( "failed"_sr ).write( counts.failed );
+        writer.write( "fail-but-ok"_sr ).write( counts.failedButOk );
+        writer.write( "skipped"_sr ).write( counts.skipped );
+    }
+
+    void JsonReporter::testRunEnded(TestRunStats const& runStats) {
+        assert( isInside( Writer::Array ) );
+        // End "test-cases"
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         runStats.totals.assertions );
+            writeCounts( totals.write( "test-cases"_sr ).writeObject(),
+                         runStats.totals.testCases );
+        }
+
+        // End the "test-run" object
+        endObject();
+    }
+
+    void JsonReporter::testCaseStarting( TestCaseInfo const& tcInfo ) {
+        StreamingReporterBase::testCaseStarting( tcInfo );
+
+        assert( isInside( Writer::Array ) &&
+                "We should be in the 'test-cases' array" );
+        startObject();
+        // "test-info" prelude
+        {
+            auto testInfo =
+                m_objectWriters.top().write( "test-info"_sr ).writeObject();
+            // TODO: handle testName vs className!!
+            testInfo.write( "name"_sr ).write( tcInfo.name );
+            writeSourceInfo(testInfo, tcInfo.lineInfo);
+            writeTags( testInfo.write( "tags"_sr ).writeArray(), tcInfo.tags );
+            writeProperties( testInfo.write( "properties"_sr ).writeArray(),
+                             tcInfo );
+        }
+
+
+        // Start the array for individual test runs (testCasePartial pairs)
+        startArray( "runs"_sr );
+    }
+
+    void JsonReporter::testCaseEnded( TestCaseStats const& tcStats ) {
+        StreamingReporterBase::testCaseEnded( tcStats );
+
+        // We need to close the 'runs' array before finishing the test case
+        assert( isInside( Writer::Array ) );
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in partial result?
+        }
+        // We do not write out stderr/stdout, because we instead wrote those out in partial runs
+
+        // TODO: aborting?
+
+        // And we also close this test case's object
+        assert( isInside( Writer::Object ) );
+        endObject();
+    }
+
+    void JsonReporter::testCasePartialStarting( TestCaseInfo const& /*tcInfo*/,
+                                                uint64_t index ) {
+        startObject();
+        m_objectWriters.top().write( "run-idx"_sr ).write( index );
+        startArray( "path"_sr );
+        // TODO: we want to delay most of the printing to the 'root' section
+        // TODO: childSection key name?
+    }
+
+    void JsonReporter::testCasePartialEnded( TestCaseStats const& tcStats,
+                                             uint64_t /*index*/ ) {
+        // Fixme: the top level section handles this.
+        //// path object
+        endArray();
+        if ( !tcStats.stdOut.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stdout"_sr )
+                .write( tcStats.stdOut );
+        }
+        if ( !tcStats.stdErr.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stderr"_sr )
+                .write( tcStats.stdErr );
+        }
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will
+            // always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in
+            // partial result?
+        }
+        // TODO: aborting?
+        // run object
+        endObject();
+    }
+
+    void JsonReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        assert( isInside( Writer::Array ) &&
+                "Section should always start inside an object" );
+        // We want to nest top level sections, even though it shares name
+        // and source loc with the TEST_CASE
+        auto& sectionObject = startObject();
+        sectionObject.write( "kind"_sr ).write( "section"_sr );
+        sectionObject.write( "name"_sr ).write( sectionInfo.name );
+        writeSourceInfo( m_objectWriters.top(), sectionInfo.lineInfo );
+
+
+        // TBD: Do we want to create this event lazily? It would become
+        //      rather complex, but we could do it, and it would look
+        //      better for empty sections. OTOH, empty sections should
+        //      be rare.
+        startArray( "path"_sr );
+    }
+    void JsonReporter::sectionEnded( SectionStats const& /*sectionStats */) {
+        // End the subpath array
+        endArray();
+        // TODO: metadata
+        // TODO: what info do we have here?
+
+        // End the section object
+        endObject();
+    }
+
+    void JsonReporter::assertionStarting( AssertionInfo const& /*assertionInfo*/ ) {}
+    void JsonReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        // TODO: There is lot of different things to handle here, but
+        //       we can fill it in later, after we show that the basic
+        //       outline and streaming reporter impl works well enough.
+        //if ( !m_config->includeSuccessfulResults()
+        //    && assertionStats.assertionResult.isOk() ) {
+        //    return;
+        //}
+        assert( isInside( Writer::Array ) );
+        auto assertionObject = m_arrayWriters.top().writeObject();
+
+        assertionObject.write( "kind"_sr ).write( "assertion"_sr );
+        writeSourceInfo( assertionObject,
+                         assertionStats.assertionResult.getSourceInfo() );
+        assertionObject.write( "status"_sr )
+            .write( assertionStats.assertionResult.isOk() );
+        // TODO: handling of result.
+        // TODO: messages
+        // TODO: totals?
+    }
+
+
+    void JsonReporter::benchmarkPreparing( StringRef name ) { (void)name; }
+    void JsonReporter::benchmarkStarting( BenchmarkInfo const& ) {}
+    void JsonReporter::benchmarkEnded( BenchmarkStats<> const& ) {}
+    void JsonReporter::benchmarkFailed( StringRef error ) { (void)error; }
+
+    void JsonReporter::listReporters(
+        std::vector<ReporterDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "reporters"_sr ).writeArray();
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listListeners(
+        std::vector<ListenerDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "listeners"_sr ).writeArray();
+
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listTests( std::vector<TestCaseHandle> const& tests ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tests"_sr ).writeArray();
+
+        for ( auto const& test : tests ) {
+            auto desc_writer = writer.writeObject();
+            auto const& info = test.getTestCaseInfo();
+
+            desc_writer.write( "name"_sr ).write( info.name );
+            desc_writer.write( "class-name"_sr ).write( info.className );
+            {
+                auto tag_writer = desc_writer.write( "tags"_sr ).writeArray();
+                for ( auto const& tag : info.tags ) {
+                    tag_writer.write( tag.original );
+                }
+            }
+            writeSourceInfo( desc_writer, info.lineInfo );
+        }
+    }
+    void JsonReporter::listTags( std::vector<TagInfo> const& tags ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tags"_sr ).writeArray();
+        for ( auto const& tag : tags ) {
+            auto tag_writer = writer.writeObject();
+            {
+                auto aliases_writer =
+                    tag_writer.write( "aliases"_sr ).writeArray();
+                for ( auto alias : tag.spellings ) {
+                    aliases_writer.write( alias );
+                }
+            }
+            tag_writer.write( "count"_sr ).write( tag.count );
+        }
+    }
+} // namespace Catch
+
+
 
 
 #include <cassert>
@@ -9299,6 +10137,8 @@ namespace Catch {
             gmtime_s(&timeInfo, &rawtime);
 #elif defined (CATCH_PLATFORM_PLAYSTATION)
             gmtime_s(&rawtime, &timeInfo);
+#elif defined (__IAR_SYSTEMS_ICC__)
+            timeInfo = *std::gmtime(&rawtime);
 #else
             gmtime_r(&rawtime, &timeInfo);
 #endif
@@ -9559,7 +10399,7 @@ namespace Catch {
                 }
             }
 
-            if( !result.getMessage().empty() )
+            if( result.hasMessage() )
                 rss << result.getMessage() << '\n';
             for( auto const& msg : stats.infoMessages )
                 if( msg.type == ResultWas::Info )
@@ -9678,7 +10518,6 @@ namespace Catch {
         }
     }
 
-    // The return value indicates if the messages buffer should be cleared:
     void MultiReporter::assertionEnded( AssertionStats const& assertionStats ) {
         const bool reportByDefault =
             assertionStats.assertionResult.getResultType() != ResultWas::Ok ||
@@ -9781,6 +10620,11 @@ namespace Catch {
             }
         }
 
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory ) {
+            getMutableRegistryHub().registerListener( CATCH_MOVE(listenerFactory) );
+        }
+
+
     } // namespace Detail
 } // namespace Catch
 
@@ -9920,7 +10764,7 @@ namespace Catch {
                 }
             }
 
-            if (!result.getMessage().empty())
+            if (result.hasMessage())
                 textRss << result.getMessage() << '\n';
 
             for (auto const& msg : stats.infoMessages)
@@ -9954,7 +10798,6 @@ namespace Catch {
 
 
 #include <algorithm>
-#include <iterator>
 #include <ostream>
 
 namespace Catch {
@@ -10105,7 +10948,7 @@ namespace Catch {
 
                 // using messages.end() directly (or auto) yields compilation error:
                 std::vector<MessageInfo>::const_iterator itEnd = messages.end();
-                const std::size_t N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+                const std::size_t N = static_cast<std::size_t>(itEnd - itMessage);
 
                 stream << colourImpl->guardColour( colour ) << " with "
                        << pluralise( N, "message"_sr ) << ':';
@@ -10203,7 +11046,7 @@ namespace Catch {
     } // end anonymous namespace
 
 
-    TeamCityReporter::~TeamCityReporter() {}
+    TeamCityReporter::~TeamCityReporter() = default;
 
     void TeamCityReporter::testRunStarting( TestRunInfo const& runInfo ) {
         m_stream << "##teamcity[testSuiteStarted name='" << escape( runInfo.name )
@@ -10377,7 +11220,7 @@ namespace Catch {
         m_xml.startElement("Catch2TestRun")
              .writeAttribute("name"_sr, m_config->name())
              .writeAttribute("rng-seed"_sr, m_config->rngSeed())
-             .writeAttribute("xml-format-version"_sr, 2)
+             .writeAttribute("xml-format-version"_sr, 3)
              .writeAttribute("catch2-version"_sr, libraryVersion());
         if ( m_config->testSpec().hasFilters() ) {
             m_xml.writeAttribute( "filters"_sr, m_config->testSpec() );
@@ -10419,11 +11262,13 @@ namespace Catch {
             // Print any info messages in <Info> tags.
             for( auto const& msg : assertionStats.infoMessages ) {
                 if( msg.type == ResultWas::Info && includeResults ) {
-                    m_xml.scopedElement( "Info" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Info" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 } else if ( msg.type == ResultWas::Warning ) {
-                    m_xml.scopedElement( "Warning" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Warning" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 }
             }
         }
@@ -10553,26 +11398,23 @@ namespace Catch {
     }
 
     void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {
-        m_xml.startElement("mean")
+        m_xml.scopedElement("mean")
             .writeAttribute("value"_sr, benchmarkStats.mean.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.mean.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.mean.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.mean.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("standardDeviation")
+        m_xml.scopedElement("standardDeviation")
             .writeAttribute("value"_sr, benchmarkStats.standardDeviation.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.standardDeviation.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.standardDeviation.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.standardDeviation.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("outliers")
+        m_xml.scopedElement("outliers")
             .writeAttribute("variance"_sr, benchmarkStats.outlierVariance)
             .writeAttribute("lowMild"_sr, benchmarkStats.outliers.low_mild)
             .writeAttribute("lowSevere"_sr, benchmarkStats.outliers.low_severe)
             .writeAttribute("highMild"_sr, benchmarkStats.outliers.high_mild)
             .writeAttribute("highSevere"_sr, benchmarkStats.outliers.high_severe);
         m_xml.endElement();
-        m_xml.endElement();
     }
 
     void XmlReporter::benchmarkFailed(StringRef error) {
diff --git a/alpaka/thirdParty/catch2/extras/catch_amalgamated.hpp b/alpaka/thirdParty/catch2/extras/catch_amalgamated.hpp
index 321cec5d..fdba759a 100644
--- a/alpaka/thirdParty/catch2/extras/catch_amalgamated.hpp
+++ b/alpaka/thirdParty/catch2/extras/catch_amalgamated.hpp
@@ -1,3 +1,4 @@
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -5,8 +6,8 @@
 
 // SPDX-License-Identifier: BSL-1.0
 
-//  Catch v3.3.2
-//  Generated: 2023-02-26 10:28:46.785908
+//  Catch v3.5.2
+//  Generated: 2024-01-15 14:06:34.036475
 //  ----------------------------------------------------------
 //  This file is an amalgamation of multiple different files.
 //  You probably shouldn't edit it directly.
@@ -59,238 +60,6 @@
 
 
 
-#ifndef CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-#define CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-
-
-
-#ifndef CATCH_NONCOPYABLE_HPP_INCLUDED
-#define CATCH_NONCOPYABLE_HPP_INCLUDED
-
-namespace Catch {
-    namespace Detail {
-
-        //! Deriving classes become noncopyable and nonmovable
-        class NonCopyable {
-            NonCopyable( NonCopyable const& ) = delete;
-            NonCopyable( NonCopyable&& ) = delete;
-            NonCopyable& operator=( NonCopyable const& ) = delete;
-            NonCopyable& operator=( NonCopyable&& ) = delete;
-
-        protected:
-            NonCopyable() noexcept = default;
-        };
-
-    } // namespace Detail
-} // namespace Catch
-
-#endif // CATCH_NONCOPYABLE_HPP_INCLUDED
-
-
-#ifndef CATCH_STRINGREF_HPP_INCLUDED
-#define CATCH_STRINGREF_HPP_INCLUDED
-
-#include <cstddef>
-#include <string>
-#include <iosfwd>
-#include <cassert>
-
-#include <cstring>
-
-namespace Catch {
-
-    /// A non-owning string class (similar to the forthcoming std::string_view)
-    /// Note that, because a StringRef may be a substring of another string,
-    /// it may not be null terminated.
-    class StringRef {
-    public:
-        using size_type = std::size_t;
-        using const_iterator = const char*;
-
-    private:
-        static constexpr char const* const s_empty = "";
-
-        char const* m_start = s_empty;
-        size_type m_size = 0;
-
-    public: // construction
-        constexpr StringRef() noexcept = default;
-
-        StringRef( char const* rawChars ) noexcept;
-
-        constexpr StringRef( char const* rawChars, size_type size ) noexcept
-        :   m_start( rawChars ),
-            m_size( size )
-        {}
-
-        StringRef( std::string const& stdString ) noexcept
-        :   m_start( stdString.c_str() ),
-            m_size( stdString.size() )
-        {}
-
-        explicit operator std::string() const {
-            return std::string(m_start, m_size);
-        }
-
-    public: // operators
-        auto operator == ( StringRef other ) const noexcept -> bool {
-            return m_size == other.m_size
-                && (std::memcmp( m_start, other.m_start, m_size ) == 0);
-        }
-        auto operator != (StringRef other) const noexcept -> bool {
-            return !(*this == other);
-        }
-
-        constexpr auto operator[] ( size_type index ) const noexcept -> char {
-            assert(index < m_size);
-            return m_start[index];
-        }
-
-        bool operator<(StringRef rhs) const noexcept;
-
-    public: // named queries
-        constexpr auto empty() const noexcept -> bool {
-            return m_size == 0;
-        }
-        constexpr auto size() const noexcept -> size_type {
-            return m_size;
-        }
-
-        // Returns a substring of [start, start + length).
-        // If start + length > size(), then the substring is [start, start + size()).
-        // If start > size(), then the substring is empty.
-        constexpr StringRef substr(size_type start, size_type length) const noexcept {
-            if (start < m_size) {
-                const auto shortened_size = m_size - start;
-                return StringRef(m_start + start, (shortened_size < length) ? shortened_size : length);
-            } else {
-                return StringRef();
-            }
-        }
-
-        // Returns the current start pointer. May not be null-terminated.
-        constexpr char const* data() const noexcept {
-            return m_start;
-        }
-
-        constexpr const_iterator begin() const { return m_start; }
-        constexpr const_iterator end() const { return m_start + m_size; }
-
-
-        friend std::string& operator += (std::string& lhs, StringRef sr);
-        friend std::ostream& operator << (std::ostream& os, StringRef sr);
-        friend std::string operator+(StringRef lhs, StringRef rhs);
-
-        /**
-         * Provides a three-way comparison with rhs
-         *
-         * Returns negative number if lhs < rhs, 0 if lhs == rhs, and a positive
-         * number if lhs > rhs
-         */
-        int compare( StringRef rhs ) const;
-    };
-
-
-    constexpr auto operator ""_sr( char const* rawChars, std::size_t size ) noexcept -> StringRef {
-        return StringRef( rawChars, size );
-    }
-} // namespace Catch
-
-constexpr auto operator ""_catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef {
-    return Catch::StringRef( rawChars, size );
-}
-
-#endif // CATCH_STRINGREF_HPP_INCLUDED
-
-#include <chrono>
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-namespace Catch {
-
-    enum class Verbosity {
-        Quiet = 0,
-        Normal,
-        High
-    };
-
-    struct WarnAbout { enum What {
-        Nothing = 0x00,
-        //! A test case or leaf section did not run any assertions
-        NoAssertions = 0x01,
-        //! A command line test spec matched no test cases
-        UnmatchedTestSpec = 0x02,
-    }; };
-
-    enum class ShowDurations {
-        DefaultForReporter,
-        Always,
-        Never
-    };
-    enum class TestRunOrder {
-        Declared,
-        LexicographicallySorted,
-        Randomized
-    };
-    enum class ColourMode : std::uint8_t {
-        //! Let Catch2 pick implementation based on platform detection
-        PlatformDefault,
-        //! Use ANSI colour code escapes
-        ANSI,
-        //! Use Win32 console colour API
-        Win32,
-        //! Don't use any colour
-        None
-    };
-    struct WaitForKeypress { enum When {
-        Never,
-        BeforeStart = 1,
-        BeforeExit = 2,
-        BeforeStartAndExit = BeforeStart | BeforeExit
-    }; };
-
-    class TestSpec;
-    class IStream;
-
-    class IConfig : public Detail::NonCopyable {
-    public:
-        virtual ~IConfig();
-
-        virtual bool allowThrows() const = 0;
-        virtual StringRef name() const = 0;
-        virtual bool includeSuccessfulResults() const = 0;
-        virtual bool shouldDebugBreak() const = 0;
-        virtual bool warnAboutMissingAssertions() const = 0;
-        virtual bool warnAboutUnmatchedTestSpecs() const = 0;
-        virtual bool zeroTestsCountAsSuccess() const = 0;
-        virtual int abortAfter() const = 0;
-        virtual bool showInvisibles() const = 0;
-        virtual ShowDurations showDurations() const = 0;
-        virtual double minDuration() const = 0;
-        virtual TestSpec const& testSpec() const = 0;
-        virtual bool hasTestFilters() const = 0;
-        virtual std::vector<std::string> const& getTestsOrTags() const = 0;
-        virtual TestRunOrder runOrder() const = 0;
-        virtual uint32_t rngSeed() const = 0;
-        virtual unsigned int shardCount() const = 0;
-        virtual unsigned int shardIndex() const = 0;
-        virtual ColourMode defaultColourMode() const = 0;
-        virtual std::vector<std::string> const& getSectionsToRun() const = 0;
-        virtual Verbosity verbosity() const = 0;
-
-        virtual bool skipBenchmarks() const = 0;
-        virtual bool benchmarkNoAnalysis() const = 0;
-        virtual unsigned int benchmarkSamples() const = 0;
-        virtual double benchmarkConfidenceInterval() const = 0;
-        virtual unsigned int benchmarkResamples() const = 0;
-        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
-    };
-}
-
-#endif // CATCH_INTERFACES_CONFIG_HPP_INCLUDED
-
-
 #ifndef CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED
 #define CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED
 
@@ -366,12 +135,18 @@ namespace Catch {
 #    define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wparentheses\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
+         _Pragma( "GCC diagnostic ignored \"-Wunused-result\"" )
+
 #    define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wunused-variable\"" )
 
 #    define CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wuseless-cast\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+         _Pragma( "GCC diagnostic ignored \"-Wshadow\"" )
+
 #    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__)
 
 #endif
@@ -444,6 +219,9 @@ namespace Catch {
 #    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS \
         _Pragma( "clang diagnostic ignored \"-Wcomma\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+        _Pragma( "clang diagnostic ignored \"-Wshadow\"" )
+
 #endif // __clang__
 
 
@@ -463,7 +241,9 @@ namespace Catch {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Assume that some platforms do not support getenv.
-#if defined(CATCH_PLATFORM_WINDOWS_UWP) || defined(CATCH_PLATFORM_PLAYSTATION)
+#if defined( CATCH_PLATFORM_WINDOWS_UWP ) ||                                   \
+    defined( CATCH_PLATFORM_PLAYSTATION ) ||                                   \
+    defined( _GAMING_XBOX )
 #    define CATCH_INTERNAL_CONFIG_NO_GETENV
 #else
 #    define CATCH_INTERNAL_CONFIG_GETENV
@@ -681,6 +461,9 @@ namespace Catch {
 #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
 #endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT)
+#   define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT
+#endif
 #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS
 #endif
@@ -690,6 +473,16 @@ namespace Catch {
 #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS
 #endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS
+#endif
+
 
 // The goal of this macro is to avoid evaluation of the arguments, but
 // still have the compiler warn on problems inside...
@@ -703,13 +496,6 @@ namespace Catch {
 #   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
 #endif
 
-#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
-#endif
-
-#if !defined(CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
-#endif
 
 #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 #define CATCH_TRY if ((true))
@@ -755,38 +541,31 @@ namespace Catch {
     class IResultCapture;
     class IConfig;
 
-    class IContext {
-    public:
-        virtual ~IContext(); // = default
+    class Context {
+        IConfig const* m_config = nullptr;
+        IResultCapture* m_resultCapture = nullptr;
 
-        virtual IResultCapture* getResultCapture() = 0;
-        virtual IConfig const* getConfig() const = 0;
-    };
+        CATCH_EXPORT static Context* currentContext;
+        friend Context& getCurrentMutableContext();
+        friend Context const& getCurrentContext();
+        static void createContext();
+        friend void cleanUpContext();
 
-    class IMutableContext : public IContext {
     public:
-        ~IMutableContext() override; // = default
-        virtual void setResultCapture( IResultCapture* resultCapture ) = 0;
-        virtual void setConfig( IConfig const* config ) = 0;
-
-    private:
-        CATCH_EXPORT static IMutableContext* currentContext;
-        friend IMutableContext& getCurrentMutableContext();
-        friend void cleanUpContext();
-        static void createContext();
+        IResultCapture* getResultCapture() const { return m_resultCapture; }
+        IConfig const* getConfig() const { return m_config; }
+        void setResultCapture( IResultCapture* resultCapture );
+        void setConfig( IConfig const* config );
     };
 
-    inline IMutableContext& getCurrentMutableContext()
-    {
-        if( !IMutableContext::currentContext )
-            IMutableContext::createContext();
+    Context& getCurrentMutableContext();
+
+    inline Context const& getCurrentContext() {
+        // We duplicate the logic from `getCurrentMutableContext` here,
+        // to avoid paying the call overhead in debug mode.
+        if ( !Context::currentContext ) { Context::createContext(); }
         // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
-        return *IMutableContext::currentContext;
-    }
-
-    inline IContext& getCurrentContext()
-    {
-        return getCurrentMutableContext();
+        return *Context::currentContext;
     }
 
     void cleanUpContext();
@@ -798,16 +577,6 @@ namespace Catch {
 #endif // CATCH_CONTEXT_HPP_INCLUDED
 
 
-#ifndef CATCH_INTERFACES_REPORTER_HPP_INCLUDED
-#define CATCH_INTERFACES_REPORTER_HPP_INCLUDED
-
-
-
-#ifndef CATCH_SECTION_INFO_HPP_INCLUDED
-#define CATCH_SECTION_INFO_HPP_INCLUDED
-
-
-
 #ifndef CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 #define CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 
@@ -822,110 +591,201 @@ namespace Catch {
 #endif // CATCH_MOVE_AND_FORWARD_HPP_INCLUDED
 
 
-#ifndef CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
-#define CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
-
-#include <cstddef>
-#include <iosfwd>
+#ifndef CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
+#define CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
 
 namespace Catch {
 
-    struct SourceLineInfo {
+    //! Used to signal that an assertion macro failed
+    struct TestFailureException{};
+    //! Used to signal that the remainder of a test should be skipped
+    struct TestSkipException {};
 
-        SourceLineInfo() = delete;
-        constexpr SourceLineInfo( char const* _file, std::size_t _line ) noexcept:
-            file( _file ),
-            line( _line )
-        {}
+    /**
+     * Outlines throwing of `TestFailureException` into a single TU
+     *
+     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
+     */
+    [[noreturn]] void throw_test_failure_exception();
 
-        bool operator == ( SourceLineInfo const& other ) const noexcept;
-        bool operator < ( SourceLineInfo const& other ) const noexcept;
+    /**
+     * Outlines throwing of `TestSkipException` into a single TU
+     *
+     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
+     */
+    [[noreturn]] void throw_test_skip_exception();
 
-        char const* file;
-        std::size_t line;
+} // namespace Catch
 
-        friend std::ostream& operator << (std::ostream& os, SourceLineInfo const& info);
-    };
-}
+#endif // CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
 
-#define CATCH_INTERNAL_LINEINFO \
-    ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
 
-#endif // CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+#ifndef CATCH_UNIQUE_NAME_HPP_INCLUDED
+#define CATCH_UNIQUE_NAME_HPP_INCLUDED
 
 
-#ifndef CATCH_TOTALS_HPP_INCLUDED
-#define CATCH_TOTALS_HPP_INCLUDED
 
-#include <cstdint>
 
-namespace Catch {
+/** \file
+ * Wrapper for the CONFIG configuration option
+ *
+ * When generating internal unique names, there are two options. Either
+ * we mix in the current line number, or mix in an incrementing number.
+ * We prefer the latter, using `__COUNTER__`, but users might want to
+ * use the former.
+ */
 
-    struct Counts {
-        Counts operator - ( Counts const& other ) const;
-        Counts& operator += ( Counts const& other );
+#ifndef CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#define CATCH_CONFIG_COUNTER_HPP_INCLUDED
 
-        std::uint64_t total() const;
-        bool allPassed() const;
-        bool allOk() const;
 
-        std::uint64_t passed = 0;
-        std::uint64_t failed = 0;
-        std::uint64_t failedButOk = 0;
-        std::uint64_t skipped = 0;
-    };
+#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
+    #define CATCH_INTERNAL_CONFIG_COUNTER
+#endif
 
-    struct Totals {
+#if defined( CATCH_INTERNAL_CONFIG_COUNTER ) && \
+    !defined( CATCH_CONFIG_NO_COUNTER ) && \
+    !defined( CATCH_CONFIG_COUNTER )
+#    define CATCH_CONFIG_COUNTER
+#endif
 
-        Totals operator - ( Totals const& other ) const;
-        Totals& operator += ( Totals const& other );
 
-        Totals delta( Totals const& prevTotals ) const;
+#endif // CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
+#ifdef CATCH_CONFIG_COUNTER
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
+#else
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
+#endif
 
-        Counts assertions;
-        Counts testCases;
-    };
-}
+#endif // CATCH_UNIQUE_NAME_HPP_INCLUDED
 
-#endif // CATCH_TOTALS_HPP_INCLUDED
 
+#ifndef CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
+#define CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
+
+#include <string>
+#include <chrono>
+
+
+
+#ifndef CATCH_STRINGREF_HPP_INCLUDED
+#define CATCH_STRINGREF_HPP_INCLUDED
+
+#include <cstddef>
 #include <string>
+#include <iosfwd>
+#include <cassert>
+
+#include <cstring>
 
 namespace Catch {
 
-    struct SectionInfo {
-        // The last argument is ignored, so that people can write
-        // SECTION("ShortName", "Proper description that is long") and
-        // still use the `-c` flag comfortably.
-        SectionInfo( SourceLineInfo const& _lineInfo, std::string _name,
-                    const char* const = nullptr ):
-            name(CATCH_MOVE(_name)),
-            lineInfo(_lineInfo)
-            {}
+    /// A non-owning string class (similar to the forthcoming std::string_view)
+    /// Note that, because a StringRef may be a substring of another string,
+    /// it may not be null terminated.
+    class StringRef {
+    public:
+        using size_type = std::size_t;
+        using const_iterator = const char*;
 
-        std::string name;
-        SourceLineInfo lineInfo;
-    };
+        static constexpr size_type npos{ static_cast<size_type>( -1 ) };
 
-    struct SectionEndInfo {
-        SectionInfo sectionInfo;
-        Counts prevAssertions;
-        double durationInSeconds;
-    };
+    private:
+        static constexpr char const* const s_empty = "";
 
-} // end namespace Catch
+        char const* m_start = s_empty;
+        size_type m_size = 0;
 
-#endif // CATCH_SECTION_INFO_HPP_INCLUDED
+    public: // construction
+        constexpr StringRef() noexcept = default;
 
+        StringRef( char const* rawChars ) noexcept;
 
-#ifndef CATCH_ASSERTION_RESULT_HPP_INCLUDED
-#define CATCH_ASSERTION_RESULT_HPP_INCLUDED
+        constexpr StringRef( char const* rawChars, size_type size ) noexcept
+        :   m_start( rawChars ),
+            m_size( size )
+        {}
+
+        StringRef( std::string const& stdString ) noexcept
+        :   m_start( stdString.c_str() ),
+            m_size( stdString.size() )
+        {}
 
+        explicit operator std::string() const {
+            return std::string(m_start, m_size);
+        }
 
+    public: // operators
+        auto operator == ( StringRef other ) const noexcept -> bool {
+            return m_size == other.m_size
+                && (std::memcmp( m_start, other.m_start, m_size ) == 0);
+        }
+        auto operator != (StringRef other) const noexcept -> bool {
+            return !(*this == other);
+        }
 
-#ifndef CATCH_ASSERTION_INFO_HPP_INCLUDED
-#define CATCH_ASSERTION_INFO_HPP_INCLUDED
+        constexpr auto operator[] ( size_type index ) const noexcept -> char {
+            assert(index < m_size);
+            return m_start[index];
+        }
+
+        bool operator<(StringRef rhs) const noexcept;
+
+    public: // named queries
+        constexpr auto empty() const noexcept -> bool {
+            return m_size == 0;
+        }
+        constexpr auto size() const noexcept -> size_type {
+            return m_size;
+        }
+
+        // Returns a substring of [start, start + length).
+        // If start + length > size(), then the substring is [start, size()).
+        // If start > size(), then the substring is empty.
+        constexpr StringRef substr(size_type start, size_type length) const noexcept {
+            if (start < m_size) {
+                const auto shortened_size = m_size - start;
+                return StringRef(m_start + start, (shortened_size < length) ? shortened_size : length);
+            } else {
+                return StringRef();
+            }
+        }
+
+        // Returns the current start pointer. May not be null-terminated.
+        constexpr char const* data() const noexcept {
+            return m_start;
+        }
+
+        constexpr const_iterator begin() const { return m_start; }
+        constexpr const_iterator end() const { return m_start + m_size; }
+
+
+        friend std::string& operator += (std::string& lhs, StringRef sr);
+        friend std::ostream& operator << (std::ostream& os, StringRef sr);
+        friend std::string operator+(StringRef lhs, StringRef rhs);
+
+        /**
+         * Provides a three-way comparison with rhs
+         *
+         * Returns negative number if lhs < rhs, 0 if lhs == rhs, and a positive
+         * number if lhs > rhs
+         */
+        int compare( StringRef rhs ) const;
+    };
+
+
+    constexpr auto operator ""_sr( char const* rawChars, std::size_t size ) noexcept -> StringRef {
+        return StringRef( rawChars, size );
+    }
+} // namespace Catch
+
+constexpr auto operator ""_catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef {
+    return Catch::StringRef( rawChars, size );
+}
 
+#endif // CATCH_STRINGREF_HPP_INCLUDED
 
 
 #ifndef CATCH_RESULT_TYPE_HPP_INCLUDED
@@ -979,120 +839,12 @@ namespace Catch {
 
 #endif // CATCH_RESULT_TYPE_HPP_INCLUDED
 
-namespace Catch {
-
-    struct AssertionInfo {
-        // AssertionInfo() = delete;
-
-        StringRef macroName;
-        SourceLineInfo lineInfo;
-        StringRef capturedExpression;
-        ResultDisposition::Flags resultDisposition;
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_ASSERTION_INFO_HPP_INCLUDED
 
+#ifndef CATCH_UNIQUE_PTR_HPP_INCLUDED
+#define CATCH_UNIQUE_PTR_HPP_INCLUDED
 
-#ifndef CATCH_LAZY_EXPR_HPP_INCLUDED
-#define CATCH_LAZY_EXPR_HPP_INCLUDED
-
-#include <iosfwd>
-
-namespace Catch {
-
-    class ITransientExpression;
-
-    class LazyExpression {
-        friend class AssertionHandler;
-        friend struct AssertionStats;
-        friend class RunContext;
-
-        ITransientExpression const* m_transientExpression = nullptr;
-        bool m_isNegated;
-    public:
-        LazyExpression( bool isNegated ):
-            m_isNegated(isNegated)
-        {}
-        LazyExpression(LazyExpression const& other) = default;
-        LazyExpression& operator = ( LazyExpression const& ) = delete;
-
-        explicit operator bool() const {
-            return m_transientExpression != nullptr;
-        }
-
-        friend auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream&;
-    };
-
-} // namespace Catch
-
-#endif // CATCH_LAZY_EXPR_HPP_INCLUDED
-
-#include <string>
-
-namespace Catch {
-
-    struct AssertionResultData
-    {
-        AssertionResultData() = delete;
-
-        AssertionResultData( ResultWas::OfType _resultType, LazyExpression const& _lazyExpression );
-
-        std::string message;
-        mutable std::string reconstructedExpression;
-        LazyExpression lazyExpression;
-        ResultWas::OfType resultType;
-
-        std::string reconstructExpression() const;
-    };
-
-    class AssertionResult {
-    public:
-        AssertionResult() = delete;
-        AssertionResult( AssertionInfo const& info, AssertionResultData&& data );
-
-        bool isOk() const;
-        bool succeeded() const;
-        ResultWas::OfType getResultType() const;
-        bool hasExpression() const;
-        bool hasMessage() const;
-        std::string getExpression() const;
-        std::string getExpressionInMacro() const;
-        bool hasExpandedExpression() const;
-        std::string getExpandedExpression() const;
-        StringRef getMessage() const;
-        SourceLineInfo getSourceInfo() const;
-        StringRef getTestMacroName() const;
-
-    //protected:
-        AssertionInfo m_info;
-        AssertionResultData m_resultData;
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_ASSERTION_RESULT_HPP_INCLUDED
-
-
-#ifndef CATCH_MESSAGE_INFO_HPP_INCLUDED
-#define CATCH_MESSAGE_INFO_HPP_INCLUDED
-
-
-
-#ifndef CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
-#define CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
-
-#include <string>
-#include <chrono>
-
-
-
-#ifndef CATCH_UNIQUE_PTR_HPP_INCLUDED
-#define CATCH_UNIQUE_PTR_HPP_INCLUDED
-
-#include <cassert>
-#include <type_traits>
+#include <cassert>
+#include <type_traits>
 
 
 namespace Catch {
@@ -1199,6 +951,45 @@ namespace Detail {
 
 #endif // CATCH_UNIQUE_PTR_HPP_INCLUDED
 
+
+#ifndef CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+
+
+
+// Adapted from donated nonius code.
+
+#ifndef CATCH_CLOCK_HPP_INCLUDED
+#define CATCH_CLOCK_HPP_INCLUDED
+
+#include <chrono>
+
+namespace Catch {
+    namespace Benchmark {
+        using IDuration = std::chrono::nanoseconds;
+        using FDuration = std::chrono::duration<double, std::nano>;
+
+        template <typename Clock>
+        using TimePoint = typename Clock::time_point;
+
+        using default_clock = std::chrono::steady_clock;
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_CLOCK_HPP_INCLUDED
+
+namespace Catch {
+
+    // We cannot forward declare the type with default template argument
+    // multiple times, so it is split out into a separate header so that
+    // we can prevent multiple declarations in dependees
+    template <typename Duration = Benchmark::FDuration>
+    struct BenchmarkStats;
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+
 namespace Catch {
 
     class AssertionResult;
@@ -1215,8 +1006,6 @@ namespace Catch {
     class IGeneratorTracker;
 
     struct BenchmarkInfo;
-    template <typename Duration = std::chrono::duration<double, std::nano>>
-    struct BenchmarkStats;
 
     namespace Generators {
         class GeneratorUntypedBase;
@@ -1228,6 +1017,7 @@ namespace Catch {
     public:
         virtual ~IResultCapture();
 
+        virtual void notifyAssertionStarted( AssertionInfo const& info ) = 0;
         virtual bool sectionStarted( StringRef sectionName,
                                      SourceLineInfo const& sectionLineInfo,
                                      Counts& assertions ) = 0;
@@ -1268,7 +1058,7 @@ namespace Catch {
                     AssertionReaction& reaction ) = 0;
         virtual void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) = 0;
         virtual void handleIncomplete
                 (   AssertionInfo const& info ) = 0;
@@ -1293,415 +1083,308 @@ namespace Catch {
 
 #endif // CATCH_INTERFACES_CAPTURE_HPP_INCLUDED
 
-#include <string>
-
-namespace Catch {
-
-    struct MessageInfo {
-        MessageInfo(    StringRef _macroName,
-                        SourceLineInfo const& _lineInfo,
-                        ResultWas::OfType _type );
-
-        StringRef macroName;
-        std::string message;
-        SourceLineInfo lineInfo;
-        ResultWas::OfType type;
-        unsigned int sequence;
-
-        bool operator == (MessageInfo const& other) const {
-            return sequence == other.sequence;
-        }
-        bool operator < (MessageInfo const& other) const {
-            return sequence < other.sequence;
-        }
-    private:
-        static unsigned int globalCount;
-    };
 
-} // end namespace Catch
-
-#endif // CATCH_MESSAGE_INFO_HPP_INCLUDED
+#ifndef CATCH_INTERFACES_CONFIG_HPP_INCLUDED
+#define CATCH_INTERFACES_CONFIG_HPP_INCLUDED
 
 
-// Adapted from donated nonius code.
 
-#ifndef CATCH_ESTIMATE_HPP_INCLUDED
-#define CATCH_ESTIMATE_HPP_INCLUDED
+#ifndef CATCH_NONCOPYABLE_HPP_INCLUDED
+#define CATCH_NONCOPYABLE_HPP_INCLUDED
 
 namespace Catch {
-    namespace Benchmark {
-        template <typename Duration>
-        struct Estimate {
-            Duration point;
-            Duration lower_bound;
-            Duration upper_bound;
-            double confidence_interval;
-
-            template <typename Duration2>
-            operator Estimate<Duration2>() const {
-                return { point, lower_bound, upper_bound, confidence_interval };
-            }
-        };
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_ESTIMATE_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
-#define CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
+    namespace Detail {
 
-namespace Catch {
-    namespace Benchmark {
-        struct OutlierClassification {
-            int samples_seen = 0;
-            int low_severe = 0;     // more than 3 times IQR below Q1
-            int low_mild = 0;       // 1.5 to 3 times IQR below Q1
-            int high_mild = 0;      // 1.5 to 3 times IQR above Q3
-            int high_severe = 0;    // more than 3 times IQR above Q3
+        //! Deriving classes become noncopyable and nonmovable
+        class NonCopyable {
+            NonCopyable( NonCopyable const& ) = delete;
+            NonCopyable( NonCopyable&& ) = delete;
+            NonCopyable& operator=( NonCopyable const& ) = delete;
+            NonCopyable& operator=( NonCopyable&& ) = delete;
 
-            int total() const {
-                return low_severe + low_mild + high_mild + high_severe;
-            }
+        protected:
+            NonCopyable() noexcept = default;
         };
-    } // namespace Benchmark
-} // namespace Catch
 
-#endif // CATCH_OUTLIERS_CLASSIFICATION_HPP_INCLUDED
+    } // namespace Detail
+} // namespace Catch
 
+#endif // CATCH_NONCOPYABLE_HPP_INCLUDED
 
-#include <map>
+#include <chrono>
+#include <iosfwd>
 #include <string>
 #include <vector>
-#include <iosfwd>
 
 namespace Catch {
 
-    struct ReporterDescription;
-    struct ListenerDescription;
-    struct TagInfo;
-    struct TestCaseInfo;
-    class TestCaseHandle;
-    class IConfig;
-    class IStream;
-    enum class ColourMode : std::uint8_t;
-
-    struct ReporterConfig {
-        ReporterConfig( IConfig const* _fullConfig,
-                        Detail::unique_ptr<IStream> _stream,
-                        ColourMode colourMode,
-                        std::map<std::string, std::string> customOptions );
-
-        ReporterConfig( ReporterConfig&& ) = default;
-        ReporterConfig& operator=( ReporterConfig&& ) = default;
-        ~ReporterConfig(); // = default
+    enum class Verbosity {
+        Quiet = 0,
+        Normal,
+        High
+    };
 
-        Detail::unique_ptr<IStream> takeStream() &&;
-        IConfig const* fullConfig() const;
-        ColourMode colourMode() const;
-        std::map<std::string, std::string> const& customOptions() const;
+    struct WarnAbout { enum What {
+        Nothing = 0x00,
+        //! A test case or leaf section did not run any assertions
+        NoAssertions = 0x01,
+        //! A command line test spec matched no test cases
+        UnmatchedTestSpec = 0x02,
+    }; };
 
-    private:
-        Detail::unique_ptr<IStream> m_stream;
-        IConfig const* m_fullConfig;
-        ColourMode m_colourMode;
-        std::map<std::string, std::string> m_customOptions;
+    enum class ShowDurations {
+        DefaultForReporter,
+        Always,
+        Never
     };
-
-    struct TestRunInfo {
-        constexpr TestRunInfo(StringRef _name) : name(_name) {}
-        StringRef name;
+    enum class TestRunOrder {
+        Declared,
+        LexicographicallySorted,
+        Randomized
     };
-
-    struct AssertionStats {
-        AssertionStats( AssertionResult const& _assertionResult,
-                        std::vector<MessageInfo> const& _infoMessages,
-                        Totals const& _totals );
-
-        AssertionStats( AssertionStats const& )              = default;
-        AssertionStats( AssertionStats && )                  = default;
-        AssertionStats& operator = ( AssertionStats const& ) = delete;
-        AssertionStats& operator = ( AssertionStats && )     = delete;
-
-        AssertionResult assertionResult;
-        std::vector<MessageInfo> infoMessages;
-        Totals totals;
+    enum class ColourMode : std::uint8_t {
+        //! Let Catch2 pick implementation based on platform detection
+        PlatformDefault,
+        //! Use ANSI colour code escapes
+        ANSI,
+        //! Use Win32 console colour API
+        Win32,
+        //! Don't use any colour
+        None
     };
+    struct WaitForKeypress { enum When {
+        Never,
+        BeforeStart = 1,
+        BeforeExit = 2,
+        BeforeStartAndExit = BeforeStart | BeforeExit
+    }; };
 
-    struct SectionStats {
-        SectionStats(   SectionInfo&& _sectionInfo,
-                        Counts const& _assertions,
-                        double _durationInSeconds,
-                        bool _missingAssertions );
+    class TestSpec;
+    class IStream;
 
-        SectionInfo sectionInfo;
-        Counts assertions;
-        double durationInSeconds;
-        bool missingAssertions;
-    };
+    class IConfig : public Detail::NonCopyable {
+    public:
+        virtual ~IConfig();
 
-    struct TestCaseStats {
-        TestCaseStats(  TestCaseInfo const& _testInfo,
-                        Totals const& _totals,
-                        std::string&& _stdOut,
-                        std::string&& _stdErr,
-                        bool _aborting );
+        virtual bool allowThrows() const = 0;
+        virtual StringRef name() const = 0;
+        virtual bool includeSuccessfulResults() const = 0;
+        virtual bool shouldDebugBreak() const = 0;
+        virtual bool warnAboutMissingAssertions() const = 0;
+        virtual bool warnAboutUnmatchedTestSpecs() const = 0;
+        virtual bool zeroTestsCountAsSuccess() const = 0;
+        virtual int abortAfter() const = 0;
+        virtual bool showInvisibles() const = 0;
+        virtual ShowDurations showDurations() const = 0;
+        virtual double minDuration() const = 0;
+        virtual TestSpec const& testSpec() const = 0;
+        virtual bool hasTestFilters() const = 0;
+        virtual std::vector<std::string> const& getTestsOrTags() const = 0;
+        virtual TestRunOrder runOrder() const = 0;
+        virtual uint32_t rngSeed() const = 0;
+        virtual unsigned int shardCount() const = 0;
+        virtual unsigned int shardIndex() const = 0;
+        virtual ColourMode defaultColourMode() const = 0;
+        virtual std::vector<std::string> const& getSectionsToRun() const = 0;
+        virtual Verbosity verbosity() const = 0;
 
-        TestCaseInfo const * testInfo;
-        Totals totals;
-        std::string stdOut;
-        std::string stdErr;
-        bool aborting;
+        virtual bool skipBenchmarks() const = 0;
+        virtual bool benchmarkNoAnalysis() const = 0;
+        virtual unsigned int benchmarkSamples() const = 0;
+        virtual double benchmarkConfidenceInterval() const = 0;
+        virtual unsigned int benchmarkResamples() const = 0;
+        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
     };
+}
 
-    struct TestRunStats {
-        TestRunStats(   TestRunInfo const& _runInfo,
-                        Totals const& _totals,
-                        bool _aborting );
+#endif // CATCH_INTERFACES_CONFIG_HPP_INCLUDED
 
-        TestRunInfo runInfo;
-        Totals totals;
-        bool aborting;
-    };
 
+#ifndef CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
+#define CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
 
-    struct BenchmarkInfo {
-        std::string name;
-        double estimatedDuration;
-        int iterations;
-        unsigned int samples;
-        unsigned int resamples;
-        double clockResolution;
-        double clockCost;
-    };
 
-    template <class Duration>
-    struct BenchmarkStats {
-        BenchmarkInfo info;
+#include <string>
 
-        std::vector<Duration> samples;
-        Benchmark::Estimate<Duration> mean;
-        Benchmark::Estimate<Duration> standardDeviation;
-        Benchmark::OutlierClassification outliers;
-        double outlierVariance;
+namespace Catch {
 
-        template <typename Duration2>
-        operator BenchmarkStats<Duration2>() const {
-            std::vector<Duration2> samples2;
-            samples2.reserve(samples.size());
-            for (auto const& sample : samples) {
-                samples2.push_back(Duration2(sample));
-            }
-            return {
-                info,
-                CATCH_MOVE(samples2),
-                mean,
-                standardDeviation,
-                outliers,
-                outlierVariance,
-            };
-        }
-    };
+    class TestCaseHandle;
+    struct TestCaseInfo;
+    class ITestCaseRegistry;
+    class IExceptionTranslatorRegistry;
+    class IExceptionTranslator;
+    class ReporterRegistry;
+    class IReporterFactory;
+    class ITagAliasRegistry;
+    class ITestInvoker;
+    class IMutableEnumValuesRegistry;
+    struct SourceLineInfo;
 
-    //! By setting up its preferences, a reporter can modify Catch2's behaviour
-    //! in some regards, e.g. it can request Catch2 to capture writes to
-    //! stdout/stderr during test execution, and pass them to the reporter.
-    struct ReporterPreferences {
-        //! Catch2 should redirect writes to stdout and pass them to the
-        //! reporter
-        bool shouldRedirectStdOut = false;
-        //! Catch2 should call `Reporter::assertionEnded` even for passing
-        //! assertions
-        bool shouldReportAllAssertions = false;
-    };
+    class StartupExceptionRegistry;
+    class EventListenerFactory;
 
-    /**
-     * The common base for all reporters and event listeners
-     *
-     * Implementing classes must also implement:
-     *
-     *     //! User-friendly description of the reporter/listener type
-     *     static std::string getDescription()
-     *
-     * Generally shouldn't be derived from by users of Catch2 directly,
-     * instead they should derive from one of the utility bases that
-     * derive from this class.
-     */
-    class IEventListener {
-    protected:
-        //! Derived classes can set up their preferences here
-        ReporterPreferences m_preferences;
-        //! The test run's config as filled in from CLI and defaults
-        IConfig const* m_config;
+    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
 
+    class IRegistryHub {
     public:
-        IEventListener( IConfig const* config ): m_config( config ) {}
+        virtual ~IRegistryHub(); // = default
 
-        virtual ~IEventListener(); // = default;
+        virtual ReporterRegistry const& getReporterRegistry() const = 0;
+        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
+        virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
+        virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
 
-        // Implementing class must also provide the following static methods:
-        // static std::string getDescription();
 
-        ReporterPreferences const& getPreferences() const {
-            return m_preferences;
-        }
+        virtual StartupExceptionRegistry const& getStartupExceptionRegistry() const = 0;
+    };
 
-        //! Called when no test cases match provided test spec
-        virtual void noMatchingTestCases( StringRef unmatchedSpec ) = 0;
-        //! Called for all invalid test specs from the cli
-        virtual void reportInvalidTestSpec( StringRef invalidArgument ) = 0;
+    class IMutableRegistryHub {
+    public:
+        virtual ~IMutableRegistryHub(); // = default
+        virtual void registerReporter( std::string const& name, IReporterFactoryPtr factory ) = 0;
+        virtual void registerListener( Detail::unique_ptr<EventListenerFactory> factory ) = 0;
+        virtual void registerTest(Detail::unique_ptr<TestCaseInfo>&& testInfo, Detail::unique_ptr<ITestInvoker>&& invoker) = 0;
+        virtual void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) = 0;
+        virtual void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) = 0;
+        virtual void registerStartupException() noexcept = 0;
+        virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() = 0;
+    };
 
-        /**
-         * Called once in a testing run before tests are started
-         *
-         * Not called if tests won't be run (e.g. only listing will happen)
-         */
-        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+    IRegistryHub const& getRegistryHub();
+    IMutableRegistryHub& getMutableRegistryHub();
+    void cleanUp();
+    std::string translateActiveException();
 
-        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
-        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
-        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
-        virtual void testCasePartialStarting( TestCaseInfo const& testInfo, uint64_t partNumber ) = 0;
-        //! Called when a `SECTION` is being entered. Not called for skipped sections
-        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+}
 
-        //! Called when user-code is being probed before the actual benchmark runs
-        virtual void benchmarkPreparing( StringRef benchmarkName ) = 0;
-        //! Called after probe but before the user-code is being benchmarked
-        virtual void benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) = 0;
-        //! Called with the benchmark results if benchmark successfully finishes
-        virtual void benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) = 0;
-        //! Called if running the benchmarks fails for any reason
-        virtual void benchmarkFailed( StringRef benchmarkName ) = 0;
+#endif // CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
 
-        //! Called before assertion success/failure is evaluated
-        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
 
-        //! Called after assertion was fully evaluated
-        virtual void assertionEnded( AssertionStats const& assertionStats ) = 0;
+#ifndef CATCH_BENCHMARK_STATS_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_HPP_INCLUDED
 
-        //! Called after a `SECTION` has finished running
-        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
-        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
-        virtual void testCasePartialEnded(TestCaseStats const& testCaseStats, uint64_t partNumber ) = 0;
-        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
-        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
-        /**
-         * Called once after all tests in a testing run are finished
-         *
-         * Not called if tests weren't run (e.g. only listings happened)
-         */
-        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
 
-        /**
-         * Called with test cases that are skipped due to the test run aborting.
-         * NOT called for test cases that are explicitly skipped using the `SKIP` macro.
-         *
-         * Deprecated - will be removed in the next major release.
-         */
-        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
 
-        //! Called if a fatal error (signal/structured exception) occured
-        virtual void fatalErrorEncountered( StringRef error ) = 0;
+// Adapted from donated nonius code.
 
-        //! Writes out information about provided reporters using reporter-specific format
-        virtual void listReporters(std::vector<ReporterDescription> const& descriptions) = 0;
-        //! Writes out the provided listeners descriptions using reporter-specific format
-        virtual void listListeners(std::vector<ListenerDescription> const& descriptions) = 0;
-        //! Writes out information about provided tests using reporter-specific format
-        virtual void listTests(std::vector<TestCaseHandle> const& tests) = 0;
-        //! Writes out information about the provided tags using reporter-specific format
-        virtual void listTags(std::vector<TagInfo> const& tags) = 0;
-    };
-    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
+#ifndef CATCH_ESTIMATE_HPP_INCLUDED
+#define CATCH_ESTIMATE_HPP_INCLUDED
 
-} // end namespace Catch
+namespace Catch {
+    namespace Benchmark {
+        template <typename Type>
+        struct Estimate {
+            Type point;
+            Type lower_bound;
+            Type upper_bound;
+            double confidence_interval;
+        };
+    } // namespace Benchmark
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+#endif // CATCH_ESTIMATE_HPP_INCLUDED
 
 
-#ifndef CATCH_UNIQUE_NAME_HPP_INCLUDED
-#define CATCH_UNIQUE_NAME_HPP_INCLUDED
+// Adapted from donated nonius code.
 
+#ifndef CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
+#define CATCH_OUTLIER_CLASSIFICATION_HPP_INCLUDED
 
+namespace Catch {
+    namespace Benchmark {
+        struct OutlierClassification {
+            int samples_seen = 0;
+            int low_severe = 0;     // more than 3 times IQR below Q1
+            int low_mild = 0;       // 1.5 to 3 times IQR below Q1
+            int high_mild = 0;      // 1.5 to 3 times IQR above Q3
+            int high_severe = 0;    // more than 3 times IQR above Q3
 
+            int total() const {
+                return low_severe + low_mild + high_mild + high_severe;
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
 
-/** \file
- * Wrapper for the CONFIG configuration option
- *
- * When generating internal unique names, there are two options. Either
- * we mix in the current line number, or mix in an incrementing number.
- * We prefer the latter, using `__COUNTER__`, but users might want to
- * use the former.
- */
+#endif // CATCH_OUTLIERS_CLASSIFICATION_HPP_INCLUDED
+// The fwd decl & default specialization needs to be seen by VS2017 before
+// BenchmarkStats itself, or VS2017 will report compilation error.
 
-#ifndef CATCH_CONFIG_COUNTER_HPP_INCLUDED
-#define CATCH_CONFIG_COUNTER_HPP_INCLUDED
+#include <string>
+#include <vector>
 
-#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
-    #define CATCH_INTERNAL_CONFIG_COUNTER
-#endif
+namespace Catch {
 
-#if defined( CATCH_INTERNAL_CONFIG_COUNTER ) && \
-    !defined( CATCH_CONFIG_NO_COUNTER ) && \
-    !defined( CATCH_CONFIG_COUNTER )
-#    define CATCH_CONFIG_COUNTER
-#endif
+    struct BenchmarkInfo {
+        std::string name;
+        double estimatedDuration;
+        int iterations;
+        unsigned int samples;
+        unsigned int resamples;
+        double clockResolution;
+        double clockCost;
+    };
 
+    // We need to keep template parameter for backwards compatibility,
+    // but we also do not want to use the template paraneter.
+    template <class Dummy>
+    struct BenchmarkStats {
+        BenchmarkInfo info;
 
-#endif // CATCH_CONFIG_COUNTER_HPP_INCLUDED
-#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
-#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
-#ifdef CATCH_CONFIG_COUNTER
-#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
-#else
-#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
-#endif
+        std::vector<Benchmark::FDuration> samples;
+        Benchmark::Estimate<Benchmark::FDuration> mean;
+        Benchmark::Estimate<Benchmark::FDuration> standardDeviation;
+        Benchmark::OutlierClassification outliers;
+        double outlierVariance;
+    };
 
-#endif // CATCH_UNIQUE_NAME_HPP_INCLUDED
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_HPP_INCLUDED
 
 
 // Adapted from donated nonius code.
 
-#ifndef CATCH_CHRONOMETER_HPP_INCLUDED
-#define CATCH_CHRONOMETER_HPP_INCLUDED
+#ifndef CATCH_ENVIRONMENT_HPP_INCLUDED
+#define CATCH_ENVIRONMENT_HPP_INCLUDED
 
 
+namespace Catch {
+    namespace Benchmark {
+        struct EnvironmentEstimate {
+            FDuration mean;
+            OutlierClassification outliers;
+        };
+        struct Environment {
+            EnvironmentEstimate clock_resolution;
+            EnvironmentEstimate clock_cost;
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_ENVIRONMENT_HPP_INCLUDED
+
 
 // Adapted from donated nonius code.
 
-#ifndef CATCH_CLOCK_HPP_INCLUDED
-#define CATCH_CLOCK_HPP_INCLUDED
+#ifndef CATCH_EXECUTION_PLAN_HPP_INCLUDED
+#define CATCH_EXECUTION_PLAN_HPP_INCLUDED
 
-#include <chrono>
-#include <ratio>
 
-namespace Catch {
-    namespace Benchmark {
-        template <typename Clock>
-        using ClockDuration = typename Clock::duration;
-        template <typename Clock>
-        using FloatDuration = std::chrono::duration<double, typename Clock::period>;
 
-        template <typename Clock>
-        using TimePoint = typename Clock::time_point;
+// Adapted from donated nonius code.
 
-        using default_clock = std::chrono::steady_clock;
+#ifndef CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
+#define CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
 
-        template <typename Clock>
-        struct now {
-            TimePoint<Clock> operator()() const {
-                return Clock::now();
-            }
-        };
 
-        using fp_seconds = std::chrono::duration<double, std::ratio<1>>;
-    } // namespace Benchmark
-} // namespace Catch
 
-#endif // CATCH_CLOCK_HPP_INCLUDED
+// Adapted from donated nonius code.
+
+#ifndef CATCH_CHRONOMETER_HPP_INCLUDED
+#define CATCH_CHRONOMETER_HPP_INCLUDED
+
 
 
 // Adapted from donated nonius code.
@@ -1709,7 +1392,7 @@ namespace Catch {
 #ifndef CATCH_OPTIMIZER_HPP_INCLUDED
 #define CATCH_OPTIMIZER_HPP_INCLUDED
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 #   include <atomic> // atomic_thread_fence
 #endif
 
@@ -1730,16 +1413,23 @@ namespace Catch {
         namespace Detail {
             inline void optimizer_barrier() { keep_memory(); }
         } // namespace Detail
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 
+#if defined(_MSVC_VER)
 #pragma optimize("", off)
+#elif defined(__IAR_SYSTEMS_ICC__)
+// For IAR the pragma only affects the following function
+#pragma optimize=disable
+#endif
         template <typename T>
         inline void keep_memory(T* p) {
             // thanks @milleniumbug
             *reinterpret_cast<char volatile*>(p) = *reinterpret_cast<char const volatile*>(p);
         }
         // TODO equivalent keep_memory()
+#if defined(_MSVC_VER)
 #pragma optimize("", on)
+#endif
 
         namespace Detail {
             inline void optimizer_barrier() {
@@ -1751,52 +1441,22 @@ namespace Catch {
 
         template <typename T>
         inline void deoptimize_value(T&& x) {
-            keep_memory(&x);
-        }
-
-        template <typename Fn, typename... Args>
-        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<!std::is_same<void, decltype(fn(args...))>::value> {
-            deoptimize_value(CATCH_FORWARD(fn) (CATCH_FORWARD(args)...));
-        }
-
-        template <typename Fn, typename... Args>
-        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
-            CATCH_FORWARD(fn) (CATCH_FORWARD(args)...);
-        }
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_OPTIMIZER_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-#define CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-
-
-
-#ifndef CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
-#define CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
-
-namespace Catch {
-
-    //! Used to signal that an assertion macro failed
-    struct TestFailureException{};
-
-    /**
-     * Outlines throwing of `TestFailureException` into a single TU
-     *
-     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
-     */
-    [[noreturn]] void throw_test_failure_exception();
+            keep_memory(&x);
+        }
 
-    //! Used to signal that the remainder of a test should be skipped
-    struct TestSkipException{};
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<!std::is_same<void, decltype(fn(args...))>::value> {
+            deoptimize_value(CATCH_FORWARD(fn) (CATCH_FORWARD(args)...));
+        }
 
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
+            CATCH_FORWARD((fn)) (CATCH_FORWARD(args)...);
+        }
+    } // namespace Benchmark
 } // namespace Catch
 
-#endif // CATCH_TEST_FAILURE_EXCEPTION_HPP_INCLUDED
+#endif // CATCH_OPTIMIZER_HPP_INCLUDED
 
 
 #ifndef CATCH_META_HPP_INCLUDED
@@ -1840,112 +1500,6 @@ namespace mpl_{
 
 #endif // CATCH_META_HPP_INCLUDED
 
-
-#ifndef CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-#define CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-
-
-#include <string>
-
-namespace Catch {
-
-    class TestCaseHandle;
-    struct TestCaseInfo;
-    class ITestCaseRegistry;
-    class IExceptionTranslatorRegistry;
-    class IExceptionTranslator;
-    class IReporterRegistry;
-    class IReporterFactory;
-    class ITagAliasRegistry;
-    class ITestInvoker;
-    class IMutableEnumValuesRegistry;
-    struct SourceLineInfo;
-
-    class StartupExceptionRegistry;
-    class EventListenerFactory;
-
-    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
-
-    class IRegistryHub {
-    public:
-        virtual ~IRegistryHub(); // = default
-
-        virtual IReporterRegistry const& getReporterRegistry() const = 0;
-        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
-        virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
-        virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
-
-
-        virtual StartupExceptionRegistry const& getStartupExceptionRegistry() const = 0;
-    };
-
-    class IMutableRegistryHub {
-    public:
-        virtual ~IMutableRegistryHub(); // = default
-        virtual void registerReporter( std::string const& name, IReporterFactoryPtr factory ) = 0;
-        virtual void registerListener( Detail::unique_ptr<EventListenerFactory> factory ) = 0;
-        virtual void registerTest(Detail::unique_ptr<TestCaseInfo>&& testInfo, Detail::unique_ptr<ITestInvoker>&& invoker) = 0;
-        virtual void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) = 0;
-        virtual void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) = 0;
-        virtual void registerStartupException() noexcept = 0;
-        virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() = 0;
-    };
-
-    IRegistryHub const& getRegistryHub();
-    IMutableRegistryHub& getMutableRegistryHub();
-    void cleanUp();
-    std::string translateActiveException();
-
-}
-
-#endif // CATCH_INTERFACES_REGISTRY_HUB_HPP_INCLUDED
-
-#include <type_traits>
-
-namespace Catch {
-    namespace Benchmark {
-        namespace Detail {
-            template <typename T>
-            struct CompleteType { using type = T; };
-            template <>
-            struct CompleteType<void> { struct type {}; };
-
-            template <typename T>
-            using CompleteType_t = typename CompleteType<T>::type;
-
-            template <typename Result>
-            struct CompleteInvoker {
-                template <typename Fun, typename... Args>
-                static Result invoke(Fun&& fun, Args&&... args) {
-                    return CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
-                }
-            };
-            template <>
-            struct CompleteInvoker<void> {
-                template <typename Fun, typename... Args>
-                static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {
-                    CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
-                    return {};
-                }
-            };
-
-            // invoke and not return void :(
-            template <typename Fun, typename... Args>
-            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
-                return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(CATCH_FORWARD(fun), CATCH_FORWARD(args)...);
-            }
-
-        } // namespace Detail
-
-        template <typename Fun>
-        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
-            return Detail::complete_invoke(CATCH_FORWARD(fun));
-        }
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_COMPLETE_INVOKE_HPP_INCLUDED
-
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
@@ -1963,7 +1517,10 @@ namespace Catch {
                 void start() override { started = Clock::now(); }
                 void finish() override { finished = Clock::now(); }
 
-                ClockDuration<Clock> elapsed() const { return finished - started; }
+                IDuration elapsed() const {
+                    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        finished - started );
+                }
 
                 TimePoint<Clock> started;
                 TimePoint<Clock> finished;
@@ -2004,50 +1561,6 @@ namespace Catch {
 
 #endif // CATCH_CHRONOMETER_HPP_INCLUDED
 
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_ENVIRONMENT_HPP_INCLUDED
-#define CATCH_ENVIRONMENT_HPP_INCLUDED
-
-
-namespace Catch {
-    namespace Benchmark {
-        template <typename Duration>
-        struct EnvironmentEstimate {
-            Duration mean;
-            OutlierClassification outliers;
-
-            template <typename Duration2>
-            operator EnvironmentEstimate<Duration2>() const {
-                return { mean, outliers };
-            }
-        };
-        template <typename Clock>
-        struct Environment {
-            using clock_type = Clock;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_cost;
-        };
-    } // namespace Benchmark
-} // namespace Catch
-
-#endif // CATCH_ENVIRONMENT_HPP_INCLUDED
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_EXECUTION_PLAN_HPP_INCLUDED
-#define CATCH_EXECUTION_PLAN_HPP_INCLUDED
-
-
-
-// Adapted from donated nonius code.
-
-#ifndef CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
-#define CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
-
-
 #include <type_traits>
 
 namespace Catch {
@@ -2184,6 +1697,57 @@ namespace Catch {
 
 
 
+// Adapted from donated nonius code.
+
+#ifndef CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+#define CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            template <typename T>
+            struct CompleteType { using type = T; };
+            template <>
+            struct CompleteType<void> { struct type {}; };
+
+            template <typename T>
+            using CompleteType_t = typename CompleteType<T>::type;
+
+            template <typename Result>
+            struct CompleteInvoker {
+                template <typename Fun, typename... Args>
+                static Result invoke(Fun&& fun, Args&&... args) {
+                    return CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
+                }
+            };
+            template <>
+            struct CompleteInvoker<void> {
+                template <typename Fun, typename... Args>
+                static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {
+                    CATCH_FORWARD(fun)(CATCH_FORWARD(args)...);
+                    return {};
+                }
+            };
+
+            // invoke and not return void :(
+            template <typename Fun, typename... Args>
+            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
+                return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(CATCH_FORWARD(fun), CATCH_FORWARD(args)...);
+            }
+
+        } // namespace Detail
+
+        template <typename Fun>
+        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
+            return Detail::complete_invoke(CATCH_FORWARD(fun));
+        }
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_COMPLETE_INVOKE_HPP_INCLUDED
+
+
 // Adapted from donated nonius code.
 
 #ifndef CATCH_TIMING_HPP_INCLUDED
@@ -2194,14 +1758,14 @@ namespace Catch {
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration, typename Result>
+        template <typename Result>
         struct Timing {
-            Duration elapsed;
+            IDuration elapsed;
             Result result;
             int iterations;
         };
-        template <typename Clock, typename Func, typename... Args>
-        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
+        template <typename Func, typename... Args>
+        using TimingOf = Timing<Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
     } // namespace Benchmark
 } // namespace Catch
 
@@ -2211,7 +1775,7 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun, typename... Args>
-            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
+            TimingOf<Fun, Args...> measure(Fun&& fun, Args&&... args) {
                 auto start = Clock::now();
                 auto&& r = Detail::complete_invoke(fun, CATCH_FORWARD(args)...);
                 auto end = Clock::now();
@@ -2230,11 +1794,11 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
+            TimingOf<Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
                 return Detail::measure<Clock>(fun, iters);
             }
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
+            TimingOf<Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
                 Detail::ChronometerModel<Clock> meter;
                 auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));
 
@@ -2249,8 +1813,8 @@ namespace Catch {
             void throw_optimized_away_error();
 
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>>
-                run_for_at_least(ClockDuration<Clock> how_long,
+            TimingOf<Fun, run_for_at_least_argument_t<Clock, Fun>>
+                run_for_at_least(IDuration how_long,
                                  const int initial_iterations,
                                  Fun&& fun) {
                 auto iters = initial_iterations;
@@ -2270,38 +1834,38 @@ namespace Catch {
 
 #endif // CATCH_RUN_FOR_AT_LEAST_HPP_INCLUDED
 
-#include <algorithm>
-#include <iterator>
+#include <vector>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct ExecutionPlan {
             int iterations_per_sample;
-            Duration estimated_duration;
+            FDuration estimated_duration;
             Detail::BenchmarkFunction benchmark;
-            Duration warmup_time;
+            FDuration warmup_time;
             int warmup_iterations;
 
-            template <typename Duration2>
-            operator ExecutionPlan<Duration2>() const {
-                return { iterations_per_sample, estimated_duration, benchmark, warmup_time, warmup_iterations };
-            }
-
             template <typename Clock>
-            std::vector<FloatDuration<Clock>> run(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            std::vector<FDuration> run(const IConfig &cfg, Environment env) const {
                 // warmup a bit
-                Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_iterations, Detail::repeat(now<Clock>{}));
+                Detail::run_for_at_least<Clock>(
+                    std::chrono::duration_cast<IDuration>( warmup_time ),
+                    warmup_iterations,
+                    Detail::repeat( []() { return Clock::now(); } )
+                );
 
-                std::vector<FloatDuration<Clock>> times;
-                times.reserve(cfg.benchmarkSamples());
-                std::generate_n(std::back_inserter(times), cfg.benchmarkSamples(), [this, env] {
+                std::vector<FDuration> times;
+                const auto num_samples = cfg.benchmarkSamples();
+                times.reserve( num_samples );
+                for ( size_t i = 0; i < num_samples; ++i ) {
                     Detail::ChronometerModel<Clock> model;
-                    this->benchmark(Chronometer(model, iterations_per_sample));
+                    this->benchmark( Chronometer( model, iterations_per_sample ) );
                     auto sample_time = model.elapsed() - env.clock_cost.mean;
-                    if (sample_time < FloatDuration<Clock>::zero()) sample_time = FloatDuration<Clock>::zero();
-                    return sample_time / iterations_per_sample;
-                });
+                    if ( sample_time < FDuration::zero() ) {
+                        sample_time = FDuration::zero();
+                    }
+                    times.push_back(sample_time / iterations_per_sample);
+                }
                 return times;
             }
         };
@@ -2324,122 +1888,35 @@ namespace Catch {
 #define CATCH_STATS_HPP_INCLUDED
 
 
-#include <algorithm>
 #include <vector>
-#include <numeric>
-#include <tuple>
-#include <cmath>
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
             using sample = std::vector<double>;
 
-            // Used when we know we want == comparison of two doubles
-            // to centralize warning suppression
-            bool directCompare( double lhs, double rhs );
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last);
-
-            template <typename Iterator>
-            OutlierClassification classify_outliers(Iterator first, Iterator last) {
-                std::vector<double> copy(first, last);
-
-                auto q1 = weighted_average_quantile(1, 4, copy.begin(), copy.end());
-                auto q3 = weighted_average_quantile(3, 4, copy.begin(), copy.end());
-                auto iqr = q3 - q1;
-                auto los = q1 - (iqr * 3.);
-                auto lom = q1 - (iqr * 1.5);
-                auto him = q3 + (iqr * 1.5);
-                auto his = q3 + (iqr * 3.);
-
-                OutlierClassification o;
-                for (; first != last; ++first) {
-                    auto&& t = *first;
-                    if (t < los) ++o.low_severe;
-                    else if (t < lom) ++o.low_mild;
-                    else if (t > his) ++o.high_severe;
-                    else if (t > him) ++o.high_mild;
-                    ++o.samples_seen;
-                }
-                return o;
-            }
-
-            template <typename Iterator>
-            double mean(Iterator first, Iterator last) {
-                auto count = last - first;
-                double sum = std::accumulate(first, last, 0.);
-                return sum / static_cast<double>(count);
-            }
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last );
 
-            template <typename Estimator, typename Iterator>
-            sample jackknife(Estimator&& estimator, Iterator first, Iterator last) {
-                auto n = static_cast<size_t>(last - first);
-                auto second = first;
-                ++second;
-                sample results;
-                results.reserve(n);
+            OutlierClassification
+            classify_outliers( double const* first, double const* last );
 
-                for (auto it = first; it != last; ++it) {
-                    std::iter_swap(it, first);
-                    results.push_back(estimator(second, last));
-                }
-
-                return results;
-            }
+            double mean( double const* first, double const* last );
 
-            inline double normal_cdf(double x) {
-                return std::erfc(-x / std::sqrt(2.0)) / 2.0;
-            }
+            double normal_cdf( double x );
 
             double erfc_inv(double x);
 
             double normal_quantile(double p);
 
-            template <typename Iterator, typename Estimator>
-            Estimate<double> bootstrap(double confidence_level, Iterator first, Iterator last, sample const& resample, Estimator&& estimator) {
-                auto n_samples = last - first;
-
-                double point = estimator(first, last);
-                // Degenerate case with a single sample
-                if (n_samples == 1) return { point, point, point, confidence_level };
-
-                sample jack = jackknife(estimator, first, last);
-                double jack_mean = mean(jack.begin(), jack.end());
-                double sum_squares, sum_cubes;
-                std::tie(sum_squares, sum_cubes) = std::accumulate(jack.begin(), jack.end(), std::make_pair(0., 0.), [jack_mean](std::pair<double, double> sqcb, double x) -> std::pair<double, double> {
-                    auto d = jack_mean - x;
-                    auto d2 = d * d;
-                    auto d3 = d2 * d;
-                    return { sqcb.first + d2, sqcb.second + d3 };
-                });
-
-                double accel = sum_cubes / (6 * std::pow(sum_squares, 1.5));
-                long n = static_cast<long>(resample.size());
-                double prob_n = std::count_if(resample.begin(), resample.end(), [point](double x) { return x < point; }) / static_cast<double>(n);
-                // degenerate case with uniform samples
-                if ( directCompare( prob_n, 0. ) ) {
-                    return { point, point, point, confidence_level };
-                }
-
-                double bias = normal_quantile(prob_n);
-                double z1 = normal_quantile((1. - confidence_level) / 2.);
-
-                auto cumn = [n]( double x ) -> long {
-                    return std::lround( normal_cdf( x ) * static_cast<double>(n) );
-                };
-                auto a = [bias, accel](double b) { return bias + b / (1. - accel * b); };
-                double b1 = bias + z1;
-                double b2 = bias - z1;
-                double a1 = a(b1);
-                double a2 = a(b2);
-                auto lo = static_cast<size_t>((std::max)(cumn(a1), 0l));
-                auto hi = static_cast<size_t>((std::min)(cumn(a2), n - 1));
-
-                return { point, resample[lo], resample[hi], confidence_level };
-            }
-
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n);
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) );
 
             struct bootstrap_analysis {
                 Estimate<double> mean;
@@ -2447,7 +1924,10 @@ namespace Catch {
                 double outlier_variance;
             };
 
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last);
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
@@ -2455,7 +1935,6 @@ namespace Catch {
 #endif // CATCH_STATS_HPP_INCLUDED
 
 #include <algorithm>
-#include <iterator>
 #include <vector>
 #include <cmath>
 
@@ -2466,46 +1945,49 @@ namespace Catch {
             std::vector<double> resolution(int k) {
                 std::vector<TimePoint<Clock>> times;
                 times.reserve(static_cast<size_t>(k + 1));
-                std::generate_n(std::back_inserter(times), k + 1, now<Clock>{});
+                for ( int i = 0; i < k + 1; ++i ) {
+                    times.push_back( Clock::now() );
+                }
 
                 std::vector<double> deltas;
                 deltas.reserve(static_cast<size_t>(k));
-                std::transform(std::next(times.begin()), times.end(), times.begin(),
-                    std::back_inserter(deltas),
-                    [](TimePoint<Clock> a, TimePoint<Clock> b) { return static_cast<double>((a - b).count()); });
+                for ( size_t idx = 1; idx < times.size(); ++idx ) {
+                    deltas.push_back( static_cast<double>(
+                        ( times[idx] - times[idx - 1] ).count() ) );
+                }
 
                 return deltas;
             }
 
-            const auto warmup_iterations = 10000;
-            const auto warmup_time = std::chrono::milliseconds(100);
-            const auto minimum_ticks = 1000;
-            const auto warmup_seed = 10000;
-            const auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
-            const auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
-            const auto clock_cost_estimation_tick_limit = 100000;
-            const auto clock_cost_estimation_time = std::chrono::milliseconds(10);
-            const auto clock_cost_estimation_iterations = 10000;
+            constexpr auto warmup_iterations = 10000;
+            constexpr auto warmup_time = std::chrono::milliseconds(100);
+            constexpr auto minimum_ticks = 1000;
+            constexpr auto warmup_seed = 10000;
+            constexpr auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
+            constexpr auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
+            constexpr auto clock_cost_estimation_tick_limit = 100000;
+            constexpr auto clock_cost_estimation_time = std::chrono::milliseconds(10);
+            constexpr auto clock_cost_estimation_iterations = 10000;
 
             template <typename Clock>
             int warmup() {
-                return run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_seed, &resolution<Clock>)
+                return run_for_at_least<Clock>(warmup_time, warmup_seed, &resolution<Clock>)
                     .iterations;
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_resolution(int iterations) {
-                auto r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_resolution_estimation_time), iterations, &resolution<Clock>)
+            EnvironmentEstimate estimate_clock_resolution(int iterations) {
+                auto r = run_for_at_least<Clock>(clock_resolution_estimation_time, iterations, &resolution<Clock>)
                     .result;
                 return {
-                    FloatDuration<Clock>(mean(r.begin(), r.end())),
-                    classify_outliers(r.begin(), r.end()),
+                    FDuration(mean(r.data(), r.data() + r.size())),
+                    classify_outliers(r.data(), r.data() + r.size()),
                 };
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_cost(FloatDuration<Clock> resolution) {
+            EnvironmentEstimate estimate_clock_cost(FDuration resolution) {
                 auto time_limit = (std::min)(
                     resolution * clock_cost_estimation_tick_limit,
-                    FloatDuration<Clock>(clock_cost_estimation_time_limit));
+                    FDuration(clock_cost_estimation_time_limit));
                 auto time_clock = [](int k) {
                     return Detail::measure<Clock>([k] {
                         for (int i = 0; i < k; ++i) {
@@ -2516,26 +1998,28 @@ namespace Catch {
                 };
                 time_clock(1);
                 int iters = clock_cost_estimation_iterations;
-                auto&& r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_cost_estimation_time), iters, time_clock);
+                auto&& r = run_for_at_least<Clock>(clock_cost_estimation_time, iters, time_clock);
                 std::vector<double> times;
                 int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));
                 times.reserve(static_cast<size_t>(nsamples));
-                std::generate_n(std::back_inserter(times), nsamples, [time_clock, &r] {
-                    return static_cast<double>((time_clock(r.iterations) / r.iterations).count());
-                });
+                for ( int s = 0; s < nsamples; ++s ) {
+                    times.push_back( static_cast<double>(
+                        ( time_clock( r.iterations ) / r.iterations )
+                            .count() ) );
+                }
                 return {
-                    FloatDuration<Clock>(mean(times.begin(), times.end())),
-                    classify_outliers(times.begin(), times.end()),
+                    FDuration(mean(times.data(), times.data() + times.size())),
+                    classify_outliers(times.data(), times.data() + times.size()),
                 };
             }
 
             template <typename Clock>
-            Environment<FloatDuration<Clock>> measure_environment() {
+            Environment measure_environment() {
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wexit-time-destructors"
 #endif
-                static Catch::Detail::unique_ptr<Environment<FloatDuration<Clock>>> env;
+                static Catch::Detail::unique_ptr<Environment> env;
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
@@ -2547,7 +2031,7 @@ namespace Catch {
                 auto resolution = Detail::estimate_clock_resolution<Clock>(iters);
                 auto cost = Detail::estimate_clock_cost<Clock>(resolution.mean);
 
-                env = Catch::Detail::make_unique<Environment<FloatDuration<Clock>>>( Environment<FloatDuration<Clock>>{resolution, cost} );
+                env = Catch::Detail::make_unique<Environment>( Environment{resolution, cost} );
                 return *env;
             }
         } // namespace Detail
@@ -2570,95 +2054,29 @@ namespace Catch {
 #define CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 
 
-#include <algorithm>
 #include <vector>
-#include <iterator>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct SampleAnalysis {
-            std::vector<Duration> samples;
-            Estimate<Duration> mean;
-            Estimate<Duration> standard_deviation;
+            std::vector<FDuration> samples;
+            Estimate<FDuration> mean;
+            Estimate<FDuration> standard_deviation;
             OutlierClassification outliers;
             double outlier_variance;
-
-            template <typename Duration2>
-            operator SampleAnalysis<Duration2>() const {
-                std::vector<Duration2> samples2;
-                samples2.reserve(samples.size());
-                std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](Duration d) { return Duration2(d); });
-                return {
-                    CATCH_MOVE(samples2),
-                    mean,
-                    standard_deviation,
-                    outliers,
-                    outlier_variance,
-                };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
 
 #endif // CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 
-#include <algorithm>
-#include <iterator>
-#include <vector>
 
 namespace Catch {
+    class IConfig;
+
     namespace Benchmark {
         namespace Detail {
-            template <typename Duration, typename Iterator>
-            SampleAnalysis<Duration> analyse(const IConfig &cfg, Environment<Duration>, Iterator first, Iterator last) {
-                if (!cfg.benchmarkNoAnalysis()) {
-                    std::vector<double> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-                    std::transform(first, last, std::back_inserter(samples), [](Duration d) { return d.count(); });
-
-                    auto analysis = Catch::Benchmark::Detail::analyse_samples(cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(), samples.begin(), samples.end());
-                    auto outliers = Catch::Benchmark::Detail::classify_outliers(samples.begin(), samples.end());
-
-                    auto wrap_estimate = [](Estimate<double> e) {
-                        return Estimate<Duration> {
-                            Duration(e.point),
-                                Duration(e.lower_bound),
-                                Duration(e.upper_bound),
-                                e.confidence_interval,
-                        };
-                    };
-                    std::vector<Duration> samples2;
-                    samples2.reserve(samples.size());
-                    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](double d) { return Duration(d); });
-                    return {
-                        CATCH_MOVE(samples2),
-                        wrap_estimate(analysis.mean),
-                        wrap_estimate(analysis.standard_deviation),
-                        outliers,
-                        analysis.outlier_variance,
-                    };
-                } else {
-                    std::vector<Duration> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-
-                    Duration mean = Duration(0);
-                    int i = 0;
-                    for (auto it = first; it < last; ++it, ++i) {
-                        samples.push_back(Duration(*it));
-                        mean += Duration(*it);
-                    }
-                    mean /= i;
-
-                    return {
-                        CATCH_MOVE(samples),
-                        Estimate<Duration>{mean, mean, mean, 0.0},
-                        Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},
-                        OutlierClassification{},
-                        0.0
-                    };
-                }
-            }
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
@@ -2666,9 +2084,9 @@ namespace Catch {
 #endif // CATCH_ANALYSE_HPP_INCLUDED
 
 #include <algorithm>
-#include <functional>
+#include <chrono>
+#include <exception>
 #include <string>
-#include <vector>
 #include <cmath>
 
 namespace Catch {
@@ -2682,16 +2100,18 @@ namespace Catch {
                 : fun(CATCH_MOVE(func)), name(CATCH_MOVE(benchmarkName)) {}
 
             template <typename Clock>
-            ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            ExecutionPlan prepare(const IConfig &cfg, Environment env) const {
                 auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
                 auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
-                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
+                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<IDuration>(run_time), 1, fun);
                 int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
-                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FDuration>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
             }
 
             template <typename Clock = default_clock>
             void run() {
+                static_assert( Clock::is_steady,
+                               "Benchmarking clock should be steady" );
                 auto const* cfg = getCurrentContext().getConfig();
 
                 auto env = Detail::measure_environment<Clock>();
@@ -2718,10 +2138,10 @@ namespace Catch {
                         return plan.template run<Clock>(*cfg, env);
                     });
 
-                    auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end());
-                    BenchmarkStats<FloatDuration<Clock>> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
+                    auto analysis = Detail::analyse(*cfg, samples.data(), samples.data() + samples.size());
+                    BenchmarkStats<> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
                     getResultCapture().benchmarkEnded(stats);
-                } CATCH_CATCH_ANON (TestFailureException) {
+                } CATCH_CATCH_ANON (TestFailureException const&) {
                     getResultCapture().benchmarkFailed("Benchmark failed due to failed assertion"_sr);
                 } CATCH_CATCH_ALL{
                     getResultCapture().benchmarkFailed(translateActiveException());
@@ -2889,6 +2309,7 @@ namespace Catch {
 #ifndef CATCH_CONFIG_WCHAR_HPP_INCLUDED
 #define CATCH_CONFIG_WCHAR_HPP_INCLUDED
 
+
 // We assume that WCHAR should be enabled by default, and only disabled
 // for a shortlist (so far only DJGPP) of compilers.
 
@@ -3112,7 +2533,6 @@ namespace Catch {
     } // namespace Detail
 
 
-    // If we decide for C++14, change these to enable_if_ts
     template <typename T, typename = void>
     struct StringMaker {
         template <typename Fake = T>
@@ -3395,6 +2815,12 @@ namespace Catch {
             }
         }
     };
+    template <>
+    struct StringMaker<std::nullopt_t> {
+        static std::string convert(const std::nullopt_t&) {
+            return "{ }";
+        }
+    };
 }
 #endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
 
@@ -3781,6 +3207,143 @@ struct StringMaker<Catch::Approx> {
 #endif // CATCH_APPROX_HPP_INCLUDED
 
 
+#ifndef CATCH_ASSERTION_INFO_HPP_INCLUDED
+#define CATCH_ASSERTION_INFO_HPP_INCLUDED
+
+
+
+#ifndef CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+#define CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+
+#include <cstddef>
+#include <iosfwd>
+
+namespace Catch {
+
+    struct SourceLineInfo {
+
+        SourceLineInfo() = delete;
+        constexpr SourceLineInfo( char const* _file, std::size_t _line ) noexcept:
+            file( _file ),
+            line( _line )
+        {}
+
+        bool operator == ( SourceLineInfo const& other ) const noexcept;
+        bool operator < ( SourceLineInfo const& other ) const noexcept;
+
+        char const* file;
+        std::size_t line;
+
+        friend std::ostream& operator << (std::ostream& os, SourceLineInfo const& info);
+    };
+}
+
+#define CATCH_INTERNAL_LINEINFO \
+    ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
+
+#endif // CATCH_SOURCE_LINE_INFO_HPP_INCLUDED
+
+namespace Catch {
+
+    struct AssertionInfo {
+        // AssertionInfo() = delete;
+
+        StringRef macroName;
+        SourceLineInfo lineInfo;
+        StringRef capturedExpression;
+        ResultDisposition::Flags resultDisposition;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_ASSERTION_INFO_HPP_INCLUDED
+
+
+#ifndef CATCH_ASSERTION_RESULT_HPP_INCLUDED
+#define CATCH_ASSERTION_RESULT_HPP_INCLUDED
+
+
+
+#ifndef CATCH_LAZY_EXPR_HPP_INCLUDED
+#define CATCH_LAZY_EXPR_HPP_INCLUDED
+
+#include <iosfwd>
+
+namespace Catch {
+
+    class ITransientExpression;
+
+    class LazyExpression {
+        friend class AssertionHandler;
+        friend struct AssertionStats;
+        friend class RunContext;
+
+        ITransientExpression const* m_transientExpression = nullptr;
+        bool m_isNegated;
+    public:
+        LazyExpression( bool isNegated ):
+            m_isNegated(isNegated)
+        {}
+        LazyExpression(LazyExpression const& other) = default;
+        LazyExpression& operator = ( LazyExpression const& ) = delete;
+
+        explicit operator bool() const {
+            return m_transientExpression != nullptr;
+        }
+
+        friend auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream&;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_LAZY_EXPR_HPP_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    struct AssertionResultData
+    {
+        AssertionResultData() = delete;
+
+        AssertionResultData( ResultWas::OfType _resultType, LazyExpression const& _lazyExpression );
+
+        std::string message;
+        mutable std::string reconstructedExpression;
+        LazyExpression lazyExpression;
+        ResultWas::OfType resultType;
+
+        std::string reconstructExpression() const;
+    };
+
+    class AssertionResult {
+    public:
+        AssertionResult() = delete;
+        AssertionResult( AssertionInfo const& info, AssertionResultData&& data );
+
+        bool isOk() const;
+        bool succeeded() const;
+        ResultWas::OfType getResultType() const;
+        bool hasExpression() const;
+        bool hasMessage() const;
+        std::string getExpression() const;
+        std::string getExpressionInMacro() const;
+        bool hasExpandedExpression() const;
+        std::string getExpandedExpression() const;
+        StringRef getMessage() const;
+        SourceLineInfo getSourceInfo() const;
+        StringRef getTestMacroName() const;
+
+    //protected:
+        AssertionInfo m_info;
+        AssertionResultData m_resultData;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_ASSERTION_RESULT_HPP_INCLUDED
+
+
 #ifndef CATCH_CONFIG_HPP_INCLUDED
 #define CATCH_CONFIG_HPP_INCLUDED
 
@@ -3945,6 +3508,7 @@ namespace Catch {
 #ifndef CATCH_OPTIONAL_HPP_INCLUDED
 #define CATCH_OPTIONAL_HPP_INCLUDED
 
+
 #include <cassert>
 
 namespace Catch {
@@ -3953,35 +3517,50 @@ namespace Catch {
     template<typename T>
     class Optional {
     public:
-        Optional() : nullableValue( nullptr ) {}
-        Optional( T const& _value )
-        : nullableValue( new( storage ) T( _value ) )
-        {}
-        Optional( Optional const& _other )
-        : nullableValue( _other ? new( storage ) T( *_other ) : nullptr )
-        {}
+        Optional(): nullableValue( nullptr ) {}
+        ~Optional() { reset(); }
+
+        Optional( T const& _value ):
+            nullableValue( new ( storage ) T( _value ) ) {}
+        Optional( T&& _value ):
+            nullableValue( new ( storage ) T( CATCH_MOVE( _value ) ) ) {}
 
-        ~Optional() {
+        Optional& operator=( T const& _value ) {
+            reset();
+            nullableValue = new ( storage ) T( _value );
+            return *this;
+        }
+        Optional& operator=( T&& _value ) {
             reset();
+            nullableValue = new ( storage ) T( CATCH_MOVE( _value ) );
+            return *this;
         }
 
-        Optional& operator= ( Optional const& _other ) {
-            if( &_other != this ) {
+        Optional( Optional const& _other ):
+            nullableValue( _other ? new ( storage ) T( *_other ) : nullptr ) {}
+        Optional( Optional&& _other ):
+            nullableValue( _other ? new ( storage ) T( CATCH_MOVE( *_other ) )
+                                  : nullptr ) {}
+
+        Optional& operator=( Optional const& _other ) {
+            if ( &_other != this ) {
                 reset();
-                if( _other )
-                    nullableValue = new( storage ) T( *_other );
+                if ( _other ) { nullableValue = new ( storage ) T( *_other ); }
             }
             return *this;
         }
-        Optional& operator = ( T const& _value ) {
-            reset();
-            nullableValue = new( storage ) T( _value );
+        Optional& operator=( Optional&& _other ) {
+            if ( &_other != this ) {
+                reset();
+                if ( _other ) {
+                    nullableValue = new ( storage ) T( CATCH_MOVE( *_other ) );
+                }
+            }
             return *this;
         }
 
         void reset() {
-            if( nullableValue )
-                nullableValue->~T();
+            if ( nullableValue ) { nullableValue->~T(); }
             nullableValue = nullptr;
         }
 
@@ -4025,177 +3604,42 @@ namespace Catch {
         }
         friend bool operator!=(Optional const& a, Optional const& b) {
             return !( a == b );
-        }
-
-    private:
-        T *nullableValue;
-        alignas(alignof(T)) char storage[sizeof(T)];
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_OPTIONAL_HPP_INCLUDED
-
-
-#ifndef CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-#define CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-
-#include <cstdint>
-
-namespace Catch {
-
-    enum class GenerateFrom {
-        Time,
-        RandomDevice,
-        //! Currently equivalent to RandomDevice, but can change at any point
-        Default
-    };
-
-    std::uint32_t generateRandomSeed(GenerateFrom from);
-
-} // end namespace Catch
-
-#endif // CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
-
-
-#ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
-#define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
-
-
-
-#ifndef CATCH_CONSOLE_COLOUR_HPP_INCLUDED
-#define CATCH_CONSOLE_COLOUR_HPP_INCLUDED
-
-
-#include <iosfwd>
-#include <cstdint>
-
-namespace Catch {
-
-    enum class ColourMode : std::uint8_t;
-    class IStream;
-
-    struct Colour {
-        enum Code {
-            None = 0,
-
-            White,
-            Red,
-            Green,
-            Blue,
-            Cyan,
-            Yellow,
-            Grey,
-
-            Bright = 0x10,
-
-            BrightRed = Bright | Red,
-            BrightGreen = Bright | Green,
-            LightGrey = Bright | Grey,
-            BrightWhite = Bright | White,
-            BrightYellow = Bright | Yellow,
-
-            // By intention
-            FileName = LightGrey,
-            Warning = BrightYellow,
-            ResultError = BrightRed,
-            ResultSuccess = BrightGreen,
-            ResultExpectedFailure = Warning,
-
-            Error = BrightRed,
-            Success = Green,
-            Skip = LightGrey,
-
-            OriginalExpression = Cyan,
-            ReconstructedExpression = BrightYellow,
-
-            SecondaryText = LightGrey,
-            Headers = White
-        };
-    };
-
-    class ColourImpl {
-    protected:
-        //! The associated stream of this ColourImpl instance
-        IStream* m_stream;
-    public:
-        ColourImpl( IStream* stream ): m_stream( stream ) {}
-
-        //! RAII wrapper around writing specific colour of text using specific
-        //! colour impl into a stream.
-        class ColourGuard {
-            ColourImpl const* m_colourImpl;
-            Colour::Code m_code;
-            bool m_engaged = false;
+        }
 
-        public:
-            //! Does **not** engage the guard/start the colour
-            ColourGuard( Colour::Code code,
-                         ColourImpl const* colour );
+    private:
+        T* nullableValue;
+        alignas(alignof(T)) char storage[sizeof(T)];
+    };
 
-            ColourGuard( ColourGuard const& rhs ) = delete;
-            ColourGuard& operator=( ColourGuard const& rhs ) = delete;
+} // end namespace Catch
 
-            ColourGuard( ColourGuard&& rhs ) noexcept;
-            ColourGuard& operator=( ColourGuard&& rhs ) noexcept;
+#endif // CATCH_OPTIONAL_HPP_INCLUDED
 
-            //! Removes colour _if_ the guard was engaged
-            ~ColourGuard();
 
-            /**
-             * Explicitly engages colour for given stream.
-             *
-             * The API based on operator<< should be preferred.
-             */
-            ColourGuard& engage( std::ostream& stream ) &;
-            /**
-             * Explicitly engages colour for given stream.
-             *
-             * The API based on operator<< should be preferred.
-             */
-            ColourGuard&& engage( std::ostream& stream ) &&;
+#ifndef CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
+#define CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
 
-        private:
-            //! Engages the guard and starts using colour
-            friend std::ostream& operator<<( std::ostream& lhs,
-                                             ColourGuard& guard ) {
-                guard.engageImpl( lhs );
-                return lhs;
-            }
-            //! Engages the guard and starts using colour
-            friend std::ostream& operator<<( std::ostream& lhs,
-                                            ColourGuard&& guard) {
-                guard.engageImpl( lhs );
-                return lhs;
-            }
+#include <cstdint>
 
-            void engageImpl( std::ostream& stream );
+namespace Catch {
 
-        };
+    enum class GenerateFrom {
+        Time,
+        RandomDevice,
+        //! Currently equivalent to RandomDevice, but can change at any point
+        Default
+    };
 
-        virtual ~ColourImpl(); // = default
-        /**
-         * Creates a guard object for given colour and this colour impl
-         *
-         * **Important:**
-         * the guard starts disengaged, and has to be engaged explicitly.
-         */
-        ColourGuard guardColour( Colour::Code colourCode );
+    std::uint32_t generateRandomSeed(GenerateFrom from);
 
-    private:
-        virtual void use( Colour::Code colourCode ) const = 0;
-    };
+} // end namespace Catch
 
-    //! Provides ColourImpl based on global config and target compilation platform
-    Detail::unique_ptr<ColourImpl> makeColourImpl( ColourMode colourSelection,
-                                                   IStream* stream );
+#endif // CATCH_RANDOM_SEED_GENERATION_HPP_INCLUDED
 
-    //! Checks if specific colour impl has been compiled into the binary
-    bool isColourImplAvailable( ColourMode colourSelection );
 
-} // end namespace Catch
+#ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
+#define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 
-#endif // CATCH_CONSOLE_COLOUR_HPP_INCLUDED
 
 #include <map>
 #include <string>
@@ -4322,7 +3766,7 @@ namespace Catch {
         bool benchmarkNoAnalysis = false;
         unsigned int benchmarkSamples = 100;
         double benchmarkConfidenceInterval = 0.95;
-        unsigned int benchmarkResamples = 100000;
+        unsigned int benchmarkResamples = 100'000;
         std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
 
         Verbosity verbosity = Verbosity::Normal;
@@ -4424,6 +3868,29 @@ namespace Catch {
 
 
 
+
+/** \file
+ * Wrapper for the CATCH_CONFIG_PREFIX_MESSAGES configuration option
+ *
+ * CATCH_CONFIG_PREFIX_ALL can be used to avoid clashes with other macros
+ * by prepending CATCH_. This may not be desirable if the only clashes are with
+ * logger macros such as INFO and WARN. In this cases
+ * CATCH_CONFIG_PREFIX_MESSAGES can be used to only prefix a small subset
+ * of relevant macros.
+ *
+ */
+
+#ifndef CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+#define CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+
+#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_PREFIX_MESSAGES)
+    #define CATCH_CONFIG_PREFIX_MESSAGES
+#endif
+
+#endif // CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+
 #ifndef CATCH_STREAM_END_STOP_HPP_INCLUDED
 #define CATCH_STREAM_END_STOP_HPP_INCLUDED
 
@@ -4435,10 +3902,10 @@ namespace Catch {
     // as well as
     //    << stuff +StreamEndStop
     struct StreamEndStop {
-        StringRef operator+() const { return StringRef(); }
+        constexpr StringRef operator+() const { return StringRef(); }
 
         template <typename T>
-        friend T const& operator+( T const& value, StreamEndStop ) {
+        constexpr friend T const& operator+( T const& value, StreamEndStop ) {
             return value;
         }
     };
@@ -4447,12 +3914,47 @@ namespace Catch {
 
 #endif // CATCH_STREAM_END_STOP_HPP_INCLUDED
 
+
+#ifndef CATCH_MESSAGE_INFO_HPP_INCLUDED
+#define CATCH_MESSAGE_INFO_HPP_INCLUDED
+
+
+#include <string>
+
+namespace Catch {
+
+    struct MessageInfo {
+        MessageInfo(    StringRef _macroName,
+                        SourceLineInfo const& _lineInfo,
+                        ResultWas::OfType _type );
+
+        StringRef macroName;
+        std::string message;
+        SourceLineInfo lineInfo;
+        ResultWas::OfType type;
+        unsigned int sequence;
+
+        bool operator == (MessageInfo const& other) const {
+            return sequence == other.sequence;
+        }
+        bool operator < (MessageInfo const& other) const {
+            return sequence < other.sequence;
+        }
+    private:
+        static unsigned int globalCount;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_MESSAGE_INFO_HPP_INCLUDED
+
 #include <string>
 #include <vector>
 
 namespace Catch {
 
     struct SourceLineInfo;
+    class IResultCapture;
 
     struct MessageStream {
 
@@ -4493,7 +3995,7 @@ namespace Catch {
 
     class Capturer {
         std::vector<MessageInfo> m_messages;
-        IResultCapture& m_resultCapture = getResultCapture();
+        IResultCapture& m_resultCapture;
         size_t m_captured = 0;
     public:
         Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names );
@@ -4544,28 +4046,28 @@ namespace Catch {
     Catch::getResultCapture().emplaceUnscopedMessage( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log )
 
 
-#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#if defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( "CATCH_INFO", msg )
   #define CATCH_UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "CATCH_UNSCOPED_INFO", msg )
   #define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( "CATCH_WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CATCH_CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CATCH_CAPTURE", __VA_ARGS__ )
 
-#elif defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg )          (void)(0)
   #define CATCH_UNSCOPED_INFO( msg ) (void)(0)
   #define CATCH_WARN( msg )          (void)(0)
   #define CATCH_CAPTURE( ... )       (void)(0)
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg ) INTERNAL_CATCH_INFO( "INFO", msg )
   #define UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "UNSCOPED_INFO", msg )
   #define WARN( msg ) INTERNAL_CATCH_MSG( "WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CAPTURE", __VA_ARGS__ )
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg )          (void)(0)
   #define UNSCOPED_INFO( msg ) (void)(0)
@@ -4580,6 +4082,75 @@ namespace Catch {
 #endif // CATCH_MESSAGE_HPP_INCLUDED
 
 
+#ifndef CATCH_SECTION_INFO_HPP_INCLUDED
+#define CATCH_SECTION_INFO_HPP_INCLUDED
+
+
+
+#ifndef CATCH_TOTALS_HPP_INCLUDED
+#define CATCH_TOTALS_HPP_INCLUDED
+
+#include <cstdint>
+
+namespace Catch {
+
+    struct Counts {
+        Counts operator - ( Counts const& other ) const;
+        Counts& operator += ( Counts const& other );
+
+        std::uint64_t total() const;
+        bool allPassed() const;
+        bool allOk() const;
+
+        std::uint64_t passed = 0;
+        std::uint64_t failed = 0;
+        std::uint64_t failedButOk = 0;
+        std::uint64_t skipped = 0;
+    };
+
+    struct Totals {
+
+        Totals operator - ( Totals const& other ) const;
+        Totals& operator += ( Totals const& other );
+
+        Totals delta( Totals const& prevTotals ) const;
+
+        Counts assertions;
+        Counts testCases;
+    };
+}
+
+#endif // CATCH_TOTALS_HPP_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    struct SectionInfo {
+        // The last argument is ignored, so that people can write
+        // SECTION("ShortName", "Proper description that is long") and
+        // still use the `-c` flag comfortably.
+        SectionInfo( SourceLineInfo const& _lineInfo, std::string _name,
+                    const char* const = nullptr ):
+            name(CATCH_MOVE(_name)),
+            lineInfo(_lineInfo)
+            {}
+
+        std::string name;
+        SourceLineInfo lineInfo;
+    };
+
+    struct SectionEndInfo {
+        SectionInfo sectionInfo;
+        Counts prevAssertions;
+        double durationInSeconds;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_SECTION_INFO_HPP_INCLUDED
+
+
 #ifndef CATCH_SESSION_HPP_INCLUDED
 #define CATCH_SESSION_HPP_INCLUDED
 
@@ -4683,17 +4254,16 @@ namespace Catch {
             enum class TokenType { Option, Argument };
             struct Token {
                 TokenType type;
-                std::string token;
+                StringRef token;
             };
 
             // Abstracts iterators into args as a stream of tokens, with option
             // arguments uniformly handled
             class TokenStream {
-                using Iterator = std::vector<std::string>::const_iterator;
+                using Iterator = std::vector<StringRef>::const_iterator;
                 Iterator it;
                 Iterator itEnd;
                 std::vector<Token> m_tokenBuffer;
-
                 void loadBuffer();
 
             public:
@@ -4745,12 +4315,17 @@ namespace Catch {
                 ResultType m_type;
             };
 
-            template <typename T> class ResultValueBase : public ResultBase {
+            template <typename T>
+            class ResultValueBase : public ResultBase {
             public:
-                auto value() const -> T const& {
+                T const& value() const& {
                     enforceOk();
                     return m_value;
                 }
+                T&& value() && {
+                    enforceOk();
+                    return CATCH_MOVE( m_value );
+                }
 
             protected:
                 ResultValueBase( ResultType type ): ResultBase( type ) {}
@@ -4760,13 +4335,23 @@ namespace Catch {
                     if ( m_type == ResultType::Ok )
                         new ( &m_value ) T( other.m_value );
                 }
+                ResultValueBase( ResultValueBase&& other ):
+                    ResultBase( other ) {
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                }
 
-                ResultValueBase( ResultType, T const& value ): ResultBase( ResultType::Ok ) {
+
+                ResultValueBase( ResultType, T const& value ):
+                    ResultBase( ResultType::Ok ) {
                     new ( &m_value ) T( value );
                 }
+                ResultValueBase( ResultType, T&& value ):
+                    ResultBase( ResultType::Ok ) {
+                    new ( &m_value ) T( CATCH_MOVE(value) );
+                }
 
-                auto operator=( ResultValueBase const& other )
-                    -> ResultValueBase& {
+                ResultValueBase& operator=( ResultValueBase const& other ) {
                     if ( m_type == ResultType::Ok )
                         m_value.~T();
                     ResultBase::operator=( other );
@@ -4774,6 +4359,14 @@ namespace Catch {
                         new ( &m_value ) T( other.m_value );
                     return *this;
                 }
+                ResultValueBase& operator=( ResultValueBase&& other ) {
+                    if ( m_type == ResultType::Ok ) m_value.~T();
+                    ResultBase::operator=( other );
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                    return *this;
+                }
+
 
                 ~ResultValueBase() override {
                     if ( m_type == ResultType::Ok )
@@ -4801,8 +4394,8 @@ namespace Catch {
                 }
 
                 template <typename U>
-                static auto ok( U const& value ) -> BasicResult {
-                    return { ResultType::Ok, value };
+                static auto ok( U&& value ) -> BasicResult {
+                    return { ResultType::Ok, CATCH_FORWARD(value) };
                 }
                 static auto ok() -> BasicResult { return { ResultType::Ok }; }
                 static auto logicError( std::string&& message )
@@ -4849,12 +4442,15 @@ namespace Catch {
             class ParseState {
             public:
                 ParseState( ParseResultType type,
-                            TokenStream const& remainingTokens );
+                            TokenStream remainingTokens );
 
                 ParseResultType type() const { return m_type; }
-                TokenStream const& remainingTokens() const {
+                TokenStream const& remainingTokens() const& {
                     return m_remainingTokens;
                 }
+                TokenStream&& remainingTokens() && {
+                    return CATCH_MOVE( m_remainingTokens );
+                }
 
             private:
                 ParseResultType m_type;
@@ -4867,7 +4463,7 @@ namespace Catch {
 
             struct HelpColumns {
                 std::string left;
-                std::string right;
+                StringRef descriptions;
             };
 
             template <typename T>
@@ -5027,7 +4623,7 @@ namespace Catch {
                 virtual ~ParserBase() = default;
                 virtual auto validate() const -> Result { return Result::ok(); }
                 virtual auto parse( std::string const& exeName,
-                                    TokenStream const& tokens ) const
+                                    TokenStream tokens ) const
                     -> InternalParseResult = 0;
                 virtual size_t cardinality() const;
 
@@ -5047,8 +4643,8 @@ namespace Catch {
             protected:
                 Optionality m_optionality = Optionality::Optional;
                 std::shared_ptr<BoundRef> m_ref;
-                std::string m_hint;
-                std::string m_description;
+                StringRef m_hint;
+                StringRef m_description;
 
                 explicit ParserRefImpl( std::shared_ptr<BoundRef> const& ref ):
                     m_ref( ref ) {}
@@ -5057,28 +4653,32 @@ namespace Catch {
                 template <typename LambdaT>
                 ParserRefImpl( accept_many_t,
                                LambdaT const& ref,
-                               std::string const& hint ):
+                               StringRef hint ):
                     m_ref( std::make_shared<BoundManyLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename T,
                           typename = typename std::enable_if_t<
                               !Detail::is_unary_function<T>::value>>
-                ParserRefImpl( T& ref, std::string const& hint ):
+                ParserRefImpl( T& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundValueRef<T>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename LambdaT,
                           typename = typename std::enable_if_t<
                               Detail::is_unary_function<LambdaT>::value>>
-                ParserRefImpl( LambdaT const& ref, std::string const& hint ):
+                ParserRefImpl( LambdaT const& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
-                auto operator()( std::string const& description ) -> DerivedT& {
+                DerivedT& operator()( StringRef description ) & {
                     m_description = description;
                     return static_cast<DerivedT&>( *this );
                 }
+                DerivedT&& operator()( StringRef description ) && {
+                    m_description = description;
+                    return static_cast<DerivedT&&>( *this );
+                }
 
                 auto optional() -> DerivedT& {
                     m_optionality = Optionality::Optional;
@@ -5101,7 +4701,7 @@ namespace Catch {
                         return 1;
                 }
 
-                std::string const& hint() const { return m_hint; }
+                StringRef hint() const { return m_hint; }
             };
 
         } // namespace detail
@@ -5115,13 +4715,13 @@ namespace Catch {
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
         // A parser for options
         class Opt : public Detail::ParserRefImpl<Opt> {
         protected:
-            std::vector<std::string> m_optNames;
+            std::vector<StringRef> m_optNames;
 
         public:
             template <typename LambdaT>
@@ -5134,33 +4734,37 @@ namespace Catch {
             template <typename LambdaT,
                       typename = typename std::enable_if_t<
                           Detail::is_unary_function<LambdaT>::value>>
-            Opt( LambdaT const& ref, std::string const& hint ):
+            Opt( LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
             template <typename LambdaT>
-            Opt( accept_many_t, LambdaT const& ref, std::string const& hint ):
+            Opt( accept_many_t, LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( accept_many, ref, hint ) {}
 
             template <typename T,
                       typename = typename std::enable_if_t<
                           !Detail::is_unary_function<T>::value>>
-            Opt( T& ref, std::string const& hint ):
+            Opt( T& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
-            auto operator[](std::string const& optName) -> Opt& {
+            Opt& operator[]( StringRef optName ) & {
                 m_optNames.push_back(optName);
                 return *this;
             }
+            Opt&& operator[]( StringRef optName ) && {
+                m_optNames.push_back( optName );
+                return CATCH_MOVE(*this);
+            }
 
-            std::vector<Detail::HelpColumns> getHelpColumns() const;
+            Detail::HelpColumns getHelpColumns() const;
 
-            bool isMatch(std::string const& optToken) const;
+            bool isMatch(StringRef optToken) const;
 
             using ParserBase::parse;
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             Detail::Result validate() const override;
         };
@@ -5183,7 +4787,7 @@ namespace Catch {
             // handled specially
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             std::string const& name() const { return *m_name; }
             Detail::ParserResult set(std::string const& newName);
@@ -5208,16 +4812,28 @@ namespace Catch {
                 return *this;
             }
 
-            auto operator|=(Opt const& opt) -> Parser& {
-                m_options.push_back(opt);
-                return *this;
+            friend Parser& operator|=( Parser& p, Opt const& opt ) {
+                p.m_options.push_back( opt );
+                return p;
+            }
+            friend Parser& operator|=( Parser& p, Opt&& opt ) {
+                p.m_options.push_back( CATCH_MOVE(opt) );
+                return p;
             }
 
             Parser& operator|=(Parser const& other);
 
             template <typename T>
-            auto operator|(T const& other) const -> Parser {
-                return Parser(*this) |= other;
+            friend Parser operator|( Parser const& p, T&& rhs ) {
+                Parser temp( p );
+                temp |= rhs;
+                return temp;
+            }
+
+            template <typename T>
+            friend Parser operator|( Parser&& p, T&& rhs ) {
+                p |= CATCH_FORWARD(rhs);
+                return CATCH_MOVE(p);
             }
 
             std::vector<Detail::HelpColumns> getHelpColumns() const;
@@ -5235,21 +4851,23 @@ namespace Catch {
             using ParserBase::parse;
             Detail::InternalParseResult
                 parse(std::string const& exeName,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
-        // Transport for raw args (copied from main args, or supplied via
-        // init list for testing)
+        /**
+         * Wrapper over argc + argv, assumes that the inputs outlive it
+         */
         class Args {
             friend Detail::TokenStream;
-            std::string m_exeName;
-            std::vector<std::string> m_args;
+            StringRef m_exeName;
+            std::vector<StringRef> m_args;
 
         public:
             Args(int argc, char const* const* argv);
-            Args(std::initializer_list<std::string> args);
+            // Helper constructor for testing
+            Args(std::initializer_list<StringRef> args);
 
-            std::string const& exeName() const { return m_exeName; }
+            StringRef exeName() const { return m_exeName; }
         };
 
 
@@ -5855,8 +5473,6 @@ namespace Catch {
 
 namespace Catch {
 
-    class IResultCapture;
-
     struct AssertionReaction {
         bool shouldDebugBreak = false;
         bool shouldThrow = false;
@@ -5897,7 +5513,6 @@ namespace Catch {
         void handleUnexpectedInflightException();
 
         void complete();
-        void setCompleted();
 
         // query
         auto allowThrows() const -> bool;
@@ -5909,13 +5524,10 @@ namespace Catch {
 
 #endif // CATCH_ASSERTION_HANDLER_HPP_INCLUDED
 
-// We need this suppression to leak, because it took until GCC 10
-// for the front end to handle local suppression via _Pragma properly
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ <= 9
-  #pragma GCC diagnostic ignored "-Wparentheses"
-#endif
 
-#if !defined(CATCH_CONFIG_DISABLE)
+#ifndef CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+#define CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+
 
 #if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)
   #define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__##_catch_sr
@@ -5923,6 +5535,16 @@ namespace Catch {
   #define CATCH_INTERNAL_STRINGIFY(...) "Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION"_catch_sr
 #endif
 
+#endif // CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+
+// We need this suppression to leak, because it took until GCC 10
+// for the front end to handle local suppression via _Pragma properly
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ <= 9
+  #pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if !defined(CATCH_CONFIG_DISABLE)
+
 #if defined(CATCH_CONFIG_FAST_COMPILE) || defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5934,7 +5556,7 @@ namespace Catch {
 #else // CATCH_CONFIG_FAST_COMPILE
 
 #define INTERNAL_CATCH_TRY try
-#define INTERNAL_CATCH_CATCH( handler ) catch(...) { handler.handleUnexpectedInflightException(); }
+#define INTERNAL_CATCH_CATCH( handler ) catch(...) { (handler).handleUnexpectedInflightException(); }
 
 #endif
 
@@ -5990,6 +5612,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6010,6 +5633,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(expr); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6036,6 +5660,7 @@ namespace Catch {
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -6051,12 +5676,40 @@ namespace Catch {
 
 #endif // CATCH_CONFIG_DISABLE
 
-#endif // CATCH_TEST_MACRO_IMPL_HPP_INCLUDED
+#endif // CATCH_TEST_MACRO_IMPL_HPP_INCLUDED
+
+
+#ifndef CATCH_SECTION_HPP_INCLUDED
+#define CATCH_SECTION_HPP_INCLUDED
+
+
+
+
+/** \file
+ * Wrapper for the STATIC_ANALYSIS_SUPPORT configuration option
+ *
+ * Some of Catch2's macros can be defined differently to work better with
+ * static analysis tools, like clang-tidy or coverity.
+ * Currently the main use case is to show that `SECTION`s are executed
+ * exclusively, and not all in one run of a `TEST_CASE`.
+ */
+
+#ifndef CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+#define CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+
+
+#if defined(__clang_analyzer__) || defined(__COVERITY__)
+    #define CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT
+#endif
 
+#if defined( CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT )
+#    define CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+#endif
 
-#ifndef CATCH_SECTION_HPP_INCLUDED
-#define CATCH_SECTION_HPP_INCLUDED
 
+#endif // CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
 
 
 #ifndef CATCH_TIMER_HPP_INCLUDED
@@ -6103,17 +5756,63 @@ namespace Catch {
 
 } // end namespace Catch
 
-#define INTERNAL_CATCH_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+#    define INTERNAL_CATCH_SECTION( ... )                                 \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                         \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                  \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(            \
+                 catch_internal_Section ) =                               \
+                 Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                     \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                     \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS              \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(        \
+                 catch_internal_Section ) =                           \
+                 Catch::SectionInfo(                                  \
+                     CATCH_INTERNAL_LINEINFO,                         \
+                     ( Catch::ReusableStringStream() << __VA_ARGS__ ) \
+                         .str() ) )                                   \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#else
+
+// These section definitions imply that at most one section at one level
+// will be intered (because only one section's __LINE__ can be equal to
+// the dummy `catchInternalSectionHint` variable from `TEST_CASE`).
+
+namespace Catch {
+    namespace Detail {
+        // Intentionally without linkage, as it should only be used as a dummy
+        // symbol for static analysis.
+        int GetNewSectionHint();
+    } // namespace Detail
+} // namespace Catch
+
+
+#    define INTERNAL_CATCH_SECTION( ... )                                   \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                           \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#endif
 
-#define INTERNAL_CATCH_DYNAMIC_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
 
 #endif // CATCH_SECTION_HPP_INCLUDED
 
@@ -6123,42 +5822,20 @@ namespace Catch {
 
 
 
-#ifndef CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
-#define CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
-
-#include <vector>
+#ifndef CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
+#define CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
 
 namespace Catch {
 
-    class TestSpec;
-    struct TestCaseInfo;
-
     class ITestInvoker {
     public:
-        virtual void invoke () const = 0;
+        virtual void invoke() const = 0;
         virtual ~ITestInvoker(); // = default
     };
 
-    class TestCaseHandle;
-    class IConfig;
-
-    class ITestCaseRegistry {
-    public:
-        virtual ~ITestCaseRegistry(); // = default
-        // TODO: this exists only for adding filenames to test cases -- let's expose this in a saner way later
-        virtual std::vector<TestCaseInfo* > const& getAllInfos() const = 0;
-        virtual std::vector<TestCaseHandle> const& getAllTests() const = 0;
-        virtual std::vector<TestCaseHandle> const& getAllTestsSorted( IConfig const& config ) const = 0;
-    };
-
-    bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
-
-}
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+#endif // CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
 
 
 #ifndef CATCH_PREPROCESSOR_REMOVE_PARENS_HPP_INCLUDED
@@ -6230,6 +5907,9 @@ struct AutoReg : Detail::NonCopyable {
         void TestName::test()
 #endif
 
+
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TESTCASE2( TestName, ... ) \
         static void TestName(); \
@@ -6242,19 +5922,40 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TESTCASE( ... ) \
         INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), __VA_ARGS__ )
 
-    ///////////////////////////////////////////////////////////////////////////////
-    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
-        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
-        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-        namespace {                                                           \
-        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
-            Catch::makeTestInvoker( &QualifiedMethod ),                   \
-            CATCH_INTERNAL_LINEINFO,                                      \
-            "&" #QualifiedMethod##_catch_sr,                              \
-            Catch::NameAndTags{ __VA_ARGS__ } );                          \
-    } /* NOLINT */ \
-        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#else  // ^^ !CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT | vv CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+
+
+// Dummy registrator for the dumy test case macros
+namespace Catch {
+    namespace Detail {
+        struct DummyUse {
+            DummyUse( void ( * )( int ) );
+        };
+    } // namespace Detail
+} // namespace Catch
+
+// Note that both the presence of the argument and its exact name are
+// necessary for the section support.
+
+// We provide a shadowed variable so that a `SECTION` inside non-`TEST_CASE`
+// tests can compile. The redefined `TEST_CASE` shadows this with param.
+static int catchInternalSectionHint = 0;
+
+#    define INTERNAL_CATCH_TESTCASE2( fname )                              \
+        static void fname( int );                                          \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                          \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                   \
+        static const Catch::Detail::DummyUse INTERNAL_CATCH_UNIQUE_NAME(   \
+            dummyUser )( &(fname) );                                       \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                            \
+        static void fname( [[maybe_unused]] int catchInternalSectionHint ) \
+            CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#    define INTERNAL_CATCH_TESTCASE( ... ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( dummyFunction ) )
+
+
+#endif // CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
 
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\
@@ -6276,6 +5977,22 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \
         INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), ClassName, __VA_ARGS__ )
 
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
+        namespace {                                                           \
+        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
+            Catch::makeTestInvoker( &QualifiedMethod ),                   \
+            CATCH_INTERNAL_LINEINFO,                                      \
+            "&" #QualifiedMethod##_catch_sr,                              \
+            Catch::NameAndTags{ __VA_ARGS__ } );                          \
+    } /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \
         do { \
@@ -7194,6 +6911,7 @@ namespace Catch {
     };
 
     class ITestInvoker;
+    struct NameAndTags;
 
     enum class TestCaseProperties : uint8_t {
         None = 0,
@@ -7318,6 +7036,10 @@ namespace Catch {
 #include <exception>
 
 namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator );
+    }
 
     class ExceptionTranslatorRegistrar {
         template<typename T>
@@ -7351,9 +7073,9 @@ namespace Catch {
     public:
         template<typename T>
         ExceptionTranslatorRegistrar( std::string(*translateFunction)( T const& ) ) {
-            getMutableRegistryHub().registerTranslator(
-                Detail::make_unique<ExceptionTranslator<T>>(translateFunction)
-            );
+            Detail::registerTranslatorImpl(
+                Detail::make_unique<ExceptionTranslator<T>>(
+                    translateFunction ) );
         }
     };
 
@@ -7425,7 +7147,7 @@ namespace Catch {
 #define CATCH_VERSION_MACROS_HPP_INCLUDED
 
 #define CATCH_VERSION_MAJOR 3
-#define CATCH_VERSION_MINOR 3
+#define CATCH_VERSION_MINOR 5
 #define CATCH_VERSION_PATCH 2
 
 #endif // CATCH_VERSION_MACROS_HPP_INCLUDED
@@ -7584,12 +7306,6 @@ namespace Detail {
         }
 
     public:
-        ~IGenerator() override = default;
-        IGenerator() = default;
-        IGenerator(IGenerator const&) = default;
-        IGenerator& operator=(IGenerator const&) = default;
-
-
         // Returns the current element of the generator
         //
         // \Precondition The generator is either freshly constructed,
@@ -8058,37 +7774,578 @@ namespace Catch {
             return static_cast<result_type>(-1);
         }
 
-        // Provide some default initial state for the default constructor
-        SimplePcg32():SimplePcg32(0xed743cc4U) {}
+        // Provide some default initial state for the default constructor
+        SimplePcg32():SimplePcg32(0xed743cc4U) {}
+
+        explicit SimplePcg32(result_type seed_);
+
+        void seed(result_type seed_);
+        void discard(uint64_t skip);
+
+        result_type operator()();
+
+    private:
+        friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+        friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+
+        // In theory we also need operator<< and operator>>
+        // In practice we do not use them, so we will skip them for now
+
+
+        std::uint64_t m_state;
+        // This part of the state determines which "stream" of the numbers
+        // is chosen -- we take it as a constant for Catch2, so we only
+        // need to deal with seeding the main state.
+        // Picked by reading 8 bytes from `/dev/random` :-)
+        static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_RANDOM_NUMBER_GENERATOR_HPP_INCLUDED
+
+
+
+#ifndef CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+
+
+
+#ifndef CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace Catch {
+    namespace Detail {
+
+        template <std::size_t>
+        struct SizedUnsignedType;
+#define SizedUnsignedTypeHelper( TYPE )        \
+    template <>                                \
+    struct SizedUnsignedType<sizeof( TYPE )> { \
+        using type = TYPE;                     \
+    }
+
+        SizedUnsignedTypeHelper( std::uint8_t );
+        SizedUnsignedTypeHelper( std::uint16_t );
+        SizedUnsignedTypeHelper( std::uint32_t );
+        SizedUnsignedTypeHelper( std::uint64_t );
+#undef SizedUnsignedTypeHelper
+
+        template <std::size_t sz>
+        using SizedUnsignedType_t = typename SizedUnsignedType<sz>::type;
+
+        template <typename T>
+        using DoubleWidthUnsignedType_t = SizedUnsignedType_t<2 * sizeof( T )>;
+
+        template <typename T>
+        struct ExtendedMultResult {
+            T upper;
+            T lower;
+            friend bool operator==( ExtendedMultResult const& lhs,
+                                    ExtendedMultResult const& rhs ) {
+                return lhs.upper == rhs.upper && lhs.lower == rhs.lower;
+            }
+        };
+
+        // Returns 128 bit result of multiplying lhs and rhs
+        constexpr ExtendedMultResult<std::uint64_t>
+        extendedMult( std::uint64_t lhs, std::uint64_t rhs ) {
+            // We use the simple long multiplication approach for
+            // correctness, we can use platform specific builtins
+            // for performance later.
+
+            // Split the lhs and rhs into two 32bit "digits", so that we can
+            // do 64 bit arithmetic to handle carry bits.
+            //            32b    32b    32b    32b
+            //     lhs                  L1     L2
+            //   * rhs                  R1     R2
+            //            ------------------------
+            //                       |  R2 * L2  |
+            //                 |  R2 * L1  |
+            //                 |  R1 * L2  |
+            //           |  R1 * L1  |
+            //           -------------------------
+            //           |  a  |  b  |  c  |  d  |
+
+#define CarryBits( x ) ( x >> 32 )
+#define Digits( x ) ( x & 0xFF'FF'FF'FF )
+
+            auto r2l2 = Digits( rhs ) * Digits( lhs );
+            auto r2l1 = Digits( rhs ) * CarryBits( lhs );
+            auto r1l2 = CarryBits( rhs ) * Digits( lhs );
+            auto r1l1 = CarryBits( rhs ) * CarryBits( lhs );
+
+            // Sum to columns first
+            auto d = Digits( r2l2 );
+            auto c = CarryBits( r2l2 ) + Digits( r2l1 ) + Digits( r1l2 );
+            auto b = CarryBits( r2l1 ) + CarryBits( r1l2 ) + Digits( r1l1 );
+            auto a = CarryBits( r1l1 );
+
+            // Propagate carries between columns
+            c += CarryBits( d );
+            b += CarryBits( c );
+            a += CarryBits( b );
+
+            // Remove the used carries
+            c = Digits( c );
+            b = Digits( b );
+            a = Digits( a );
+
+#undef CarryBits
+#undef Digits
+
+            return {
+                a << 32 | b, // upper 64 bits
+                c << 32 | d  // lower 64 bits
+            };
+        }
+
+        template <typename UInt>
+        constexpr ExtendedMultResult<UInt> extendedMult( UInt lhs, UInt rhs ) {
+            static_assert( std::is_unsigned<UInt>::value,
+                           "extendedMult can only handle unsigned integers" );
+            static_assert( sizeof( UInt ) < sizeof( std::uint64_t ),
+                           "Generic extendedMult can only handle types smaller "
+                           "than uint64_t" );
+            using WideType = DoubleWidthUnsignedType_t<UInt>;
+
+            auto result = WideType( lhs ) * WideType( rhs );
+            return {
+                static_cast<UInt>( result >> ( CHAR_BIT * sizeof( UInt ) ) ),
+                static_cast<UInt>( result & UInt( -1 ) ) };
+        }
+
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) >= sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value, "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            // We want to return the top bits from a generator, as they are
+            // usually considered higher quality.
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+
+            return static_cast<TargetType>( gen() >>
+                                            ( generated_bits - return_bits) );
+        }
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) < sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value,
+                           "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+            std::size_t filled_bits = 0;
+            TargetType ret = 0;
+            do {
+                ret <<= generated_bits;
+                ret |= gen();
+                filled_bits += generated_bits;
+            } while ( filled_bits < return_bits );
+
+            return ret;
+        }
+
+        /*
+         * Transposes numbers into unsigned type while keeping their ordering
+         *
+         * This means that signed types are changed so that the ordering is
+         * [INT_MIN, ..., -1, 0, ..., INT_MAX], rather than order we would
+         * get by simple casting ([0, ..., INT_MAX, INT_MIN, ..., -1])
+         */
+        template <typename OriginalType, typename UnsignedType>
+        std::enable_if_t<std::is_signed<OriginalType>::value, UnsignedType>
+        transposeToNaturalOrder( UnsignedType in ) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value,
+                           "Input type must be unsigned" );
+            // Assuming 2s complement (standardized in current C++), the
+            // positive and negative numbers are already internally ordered,
+            // and their difference is in the top bit. Swapping it orders
+            // them the desired way.
+            constexpr auto highest_bit =
+                UnsignedType( 1 ) << ( sizeof( UnsignedType ) * CHAR_BIT - 1 );
+            return static_cast<UnsignedType>( in ^ highest_bit );
+        }
+
+
+
+        template <typename OriginalType,
+                  typename UnsignedType>
+        std::enable_if_t<std::is_unsigned<OriginalType>::value, UnsignedType>
+            transposeToNaturalOrder(UnsignedType in) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value, "Input type must be unsigned" );
+            // No reordering is needed for unsigned -> unsigned
+            return in;
+        }
+    } // namespace Detail
+} // namespace Catch
+
+#endif // CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+namespace Catch {
+
+    namespace Detail {
+        // Indirection to enable make_unsigned<bool> behaviour.
+        template <typename T>
+        struct make_unsigned {
+            using type = std::make_unsigned_t<T>;
+        };
+
+        template <>
+        struct make_unsigned<bool> {
+            using type = uint8_t;
+        };
+
+        template <typename T>
+        using make_unsigned_t = typename make_unsigned<T>::type;
+    }
+
+/**
+ * Implementation of uniform distribution on integers.
+ *
+ * Unlike `std::uniform_int_distribution`, this implementation supports
+ * various 1 byte integral types, including bool (but you should not
+ * actually use it for bools).
+ *
+ * The underlying algorithm is based on the one described in "Fast Random
+ * Integer Generation in an Interval" by Daniel Lemire, but has been
+ * optimized under the assumption of reuse of the same distribution object.
+ */
+template <typename IntegerType>
+class uniform_integer_distribution {
+    static_assert(std::is_integral<IntegerType>::value, "...");
+
+    using UnsignedIntegerType = Detail::make_unsigned_t<IntegerType>;
+
+    // Only the left bound is stored, and we store it converted to its
+    // unsigned image. This avoids having to do the conversions inside
+    // the operator(), at the cost of having to do the conversion in
+    // the a() getter. The right bound is only needed in the b() getter,
+    // so we recompute it there from other stored data.
+    UnsignedIntegerType m_a;
+
+    // How many different values are there in [a, b]. a == b => 1, can be 0 for distribution over all values in the type.
+    UnsignedIntegerType m_ab_distance;
+
+    // We hoisted this out of the main generation function. Technically,
+    // this means that using this distribution will be slower than Lemire's
+    // algorithm if this distribution instance will be used only few times,
+    // but it will be faster if it is used many times. Since Catch2 uses
+    // distributions only to implement random generators, we assume that each
+    // distribution will be reused many times and this is an optimization.
+    UnsignedIntegerType m_rejection_threshold = 0;
+
+    UnsignedIntegerType computeDistance(IntegerType a, IntegerType b) const {
+        // This overflows and returns 0 if a == 0 and b == TYPE_MAX.
+        // We handle that later when generating the number.
+        return transposeTo(b) - transposeTo(a) + 1;
+    }
+
+    static UnsignedIntegerType computeRejectionThreshold(UnsignedIntegerType ab_distance) {
+        // distance == 0 means that we will return all possible values from
+        // the type's range, and that we shouldn't reject anything.
+        if ( ab_distance == 0 ) { return 0; }
+        return ( ~ab_distance + 1 ) % ab_distance;
+    }
+
+    static UnsignedIntegerType transposeTo(IntegerType in) {
+        return Detail::transposeToNaturalOrder<IntegerType>(
+            static_cast<UnsignedIntegerType>( in ) );
+    }
+    static IntegerType transposeBack(UnsignedIntegerType in) {
+        return static_cast<IntegerType>(
+            Detail::transposeToNaturalOrder<IntegerType>(in) );
+    }
+
+public:
+    using result_type = IntegerType;
+
+    uniform_integer_distribution( IntegerType a, IntegerType b ):
+        m_a( transposeTo(a) ),
+        m_ab_distance( computeDistance(a, b) ),
+        m_rejection_threshold( computeRejectionThreshold(m_ab_distance) ) {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        // All possible values of result_type are valid.
+        if ( m_ab_distance == 0 ) {
+            return transposeBack( Detail::fillBitsFrom<UnsignedIntegerType>( g ) );
+        }
+
+        auto random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+        auto emul = Detail::extendedMult( random_number, m_ab_distance );
+        // Unlike Lemire's algorithm we skip the ab_distance check, since
+        // we precomputed the rejection threshold, which is always tighter.
+        while (emul.lower < m_rejection_threshold) {
+            random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+            emul = Detail::extendedMult( random_number, m_ab_distance );
+        }
+
+        return transposeBack(m_a + emul.upper);
+    }
+
+    result_type a() const { return transposeBack(m_a); }
+    result_type b() const { return transposeBack(m_ab_distance + m_a - 1); }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+
+
+#ifndef CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+
+
+
+
+#ifndef CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+
+
+
+#ifndef CATCH_POLYFILLS_HPP_INCLUDED
+#define CATCH_POLYFILLS_HPP_INCLUDED
+
+namespace Catch {
+
+    bool isnan(float f);
+    bool isnan(double d);
+
+    float nextafter(float x, float y);
+    double nextafter(double x, double y);
+
+}
+
+#endif // CATCH_POLYFILLS_HPP_INCLUDED
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+        /**
+         * Returns the largest magnitude of 1-ULP distance inside the [a, b] range.
+         *
+         * Assumes `a < b`.
+         */
+        template <typename FloatType>
+        FloatType gamma(FloatType a, FloatType b) {
+            static_assert( std::is_floating_point<FloatType>::value,
+                           "gamma returns the largest ULP magnitude within "
+                           "floating point range [a, b]. This only makes sense "
+                           "for floating point types" );
+            assert( a <= b );
+
+            const auto gamma_up = Catch::nextafter( a, std::numeric_limits<FloatType>::infinity() ) - a;
+            const auto gamma_down = b - Catch::nextafter( b, -std::numeric_limits<FloatType>::infinity() );
+
+            return gamma_up < gamma_down ? gamma_down : gamma_up;
+        }
+
+        template <typename FloatingPoint>
+        struct DistanceTypePicker;
+        template <>
+        struct DistanceTypePicker<float> {
+            using type = std::uint32_t;
+        };
+        template <>
+        struct DistanceTypePicker<double> {
+            using type = std::uint64_t;
+        };
+
+        template <typename T>
+        using DistanceType = typename DistanceTypePicker<T>::type;
+
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        /**
+         * Computes the number of equi-distant floats in [a, b]
+         *
+         * Since not every range can be split into equidistant floats
+         * exactly, we actually compute ceil(b/distance - a/distance),
+         * because in those cases we want to overcount.
+         *
+         * Uses modified Dekker's FastTwoSum algorithm to handle rounding.
+         */
+        template <typename FloatType>
+        DistanceType<FloatType>
+        count_equidistant_floats( FloatType a, FloatType b, FloatType distance ) {
+            assert( a <= b );
+            // We get distance as gamma for our uniform float distribution,
+            // so this will round perfectly.
+            const auto ag = a / distance;
+            const auto bg = b / distance;
+
+            const auto s = bg - ag;
+            const auto err = ( std::fabs( a ) <= std::fabs( b ) )
+                                 ? -ag - ( s - bg )
+                                 : bg - ( s + ag );
+            const auto ceil_s = static_cast<DistanceType<FloatType>>( std::ceil( s ) );
+
+            return ( ceil_s != s ) ? ceil_s : ceil_s + ( err > 0 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+    }
 
-        explicit SimplePcg32(result_type seed_);
+} // end namespace Catch
 
-        void seed(result_type seed_);
-        void discard(uint64_t skip);
+#endif // CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
 
-        result_type operator()();
+#include <cmath>
+#include <type_traits>
 
-    private:
-        friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
-        friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+namespace Catch {
 
-        // In theory we also need operator<< and operator>>
-        // In practice we do not use them, so we will skip them for now
+    namespace Detail {
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        // The issue with overflow only happens with maximal ULP and HUGE
+        // distance, e.g. when generating numbers in [-inf, inf] for given
+        // type. So we only check for the largest possible ULP in the
+        // type, and return something that does not overflow to inf in 1 mult.
+        constexpr std::uint64_t calculate_max_steps_in_one_go(double gamma) {
+            if ( gamma == 1.99584030953472e+292 ) { return 9007199254740991; }
+            return static_cast<std::uint64_t>( -1 );
+        }
+        constexpr std::uint32_t calculate_max_steps_in_one_go(float gamma) {
+            if ( gamma == 2.028241e+31f ) { return 16777215; }
+            return static_cast<std::uint32_t>( -1 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+    }
 
+/**
+ * Implementation of uniform distribution on floating point numbers.
+ *
+ * Note that we support only `float` and `double` types, because these
+ * usually mean the same thing across different platform. `long double`
+ * varies wildly by platform and thus we cannot provide reproducible
+ * implementation. Also note that we don't implement all parts of
+ * distribution per standard: this distribution is not serializable, nor
+ * can the range be arbitrarily reset.
+ *
+ * The implementation also uses different approach than the one taken by
+ * `std::uniform_real_distribution`, where instead of generating a number
+ * between [0, 1) and then multiplying the range bounds with it, we first
+ * split the [a, b] range into a set of equidistributed floating point
+ * numbers, and then use uniform int distribution to pick which one to
+ * return.
+ *
+ * This has the advantage of guaranteeing uniformity (the multiplication
+ * method loses uniformity due to rounding when multiplying floats), except
+ * for small non-uniformity at one side of the interval, where we have
+ * to deal with the fact that not every interval is splittable into
+ * equidistributed floats.
+ *
+ * Based on "Drawing random floating-point numbers from an interval" by
+ * Frederic Goualard.
+ */
+template <typename FloatType>
+class uniform_floating_point_distribution {
+    static_assert(std::is_floating_point<FloatType>::value, "...");
+    static_assert(!std::is_same<FloatType, long double>::value,
+                  "We do not support long double due to inconsistent behaviour between platforms");
+
+    using WidthType = Detail::DistanceType<FloatType>;
+
+    FloatType m_a, m_b;
+    FloatType m_ulp_magnitude;
+    WidthType m_floats_in_range;
+    uniform_integer_distribution<WidthType> m_int_dist;
+
+    // In specific cases, we can overflow into `inf` when computing the
+    // `steps * g` offset. To avoid this, we don't offset by more than this
+    // in one multiply + addition.
+    WidthType m_max_steps_in_one_go;
+    // We don't want to do the magnitude check every call to `operator()`
+    bool m_a_has_leq_magnitude;
 
-        std::uint64_t m_state;
-        // This part of the state determines which "stream" of the numbers
-        // is chosen -- we take it as a constant for Catch2, so we only
-        // need to deal with seeding the main state.
-        // Picked by reading 8 bytes from `/dev/random` :-)
-        static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;
-    };
+public:
+    using result_type = FloatType;
+
+    uniform_floating_point_distribution( FloatType a, FloatType b ):
+        m_a( a ),
+        m_b( b ),
+        m_ulp_magnitude( Detail::gamma( m_a, m_b ) ),
+        m_floats_in_range( Detail::count_equidistant_floats( m_a, m_b, m_ulp_magnitude ) ),
+        m_int_dist(0, m_floats_in_range),
+        m_max_steps_in_one_go( Detail::calculate_max_steps_in_one_go(m_ulp_magnitude)),
+        m_a_has_leq_magnitude(std::fabs(m_a) <= std::fabs(m_b))
+    {
+        assert( a <= b );
+    }
 
-} // end namespace Catch
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        WidthType steps = m_int_dist( g );
+        if ( m_a_has_leq_magnitude ) {
+            if ( steps == m_floats_in_range ) { return m_a; }
+            auto b = m_b;
+            while (steps > m_max_steps_in_one_go) {
+                b -= m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return b - steps * m_ulp_magnitude;
+        } else {
+            if ( steps == m_floats_in_range ) { return m_b; }
+            auto a = m_a;
+            while (steps > m_max_steps_in_one_go) {
+                a += m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return a + steps * m_ulp_magnitude;
+        }
+    }
 
-#endif // CATCH_RANDOM_NUMBER_GENERATOR_HPP_INCLUDED
+    result_type a() const { return m_a; }
+    result_type b() const { return m_b; }
+};
+
+} // end namespace Catch
 
-#include <random>
+#endif // CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
 
 namespace Catch {
 namespace Generators {
@@ -8102,7 +8359,7 @@ namespace Detail {
 template <typename Float>
 class RandomFloatingGenerator final : public IGenerator<Float> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_real_distribution<Float> m_dist;
+    Catch::uniform_floating_point_distribution<Float> m_dist;
     Float m_current_number;
 public:
     RandomFloatingGenerator( Float a, Float b, std::uint32_t seed ):
@@ -8120,10 +8377,27 @@ class RandomFloatingGenerator final : public IGenerator<Float> {
     }
 };
 
+template <>
+class RandomFloatingGenerator<long double> final : public IGenerator<long double> {
+    // We still rely on <random> for this specialization, but we don't
+    // want to drag it into the header.
+    struct PImpl;
+    Catch::Detail::unique_ptr<PImpl> m_pimpl;
+    long double m_current_number;
+
+public:
+    RandomFloatingGenerator( long double a, long double b, std::uint32_t seed );
+
+    long double const& get() const override { return m_current_number; }
+    bool next() override;
+
+    ~RandomFloatingGenerator() override; // = default
+};
+
 template <typename Integer>
 class RandomIntegerGenerator final : public IGenerator<Integer> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_int_distribution<Integer> m_dist;
+    Catch::uniform_integer_distribution<Integer> m_dist;
     Integer m_current_number;
 public:
     RandomIntegerGenerator( Integer a, Integer b, std::uint32_t seed ):
@@ -8144,14 +8418,6 @@ class RandomIntegerGenerator final : public IGenerator<Integer> {
 template <typename T>
 std::enable_if_t<std::is_integral<T>::value, GeneratorWrapper<T>>
 random(T a, T b) {
-    static_assert(
-        !std::is_same<T, char>::value &&
-        !std::is_same<T, int8_t>::value &&
-        !std::is_same<T, uint8_t>::value &&
-        !std::is_same<T, signed char>::value &&
-        !std::is_same<T, unsigned char>::value &&
-        !std::is_same<T, bool>::value,
-        "The requested type is not supported by the underlying random distributions from std" );
     return GeneratorWrapper<T>(
         Catch::Detail::make_unique<RandomIntegerGenerator<T>>(a, b, Detail::getSeed())
     );
@@ -8264,39 +8530,266 @@ GeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {
     return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(from, to));
 }
 
-template <typename Container,
-          typename ResultType = typename Container::value_type>
-GeneratorWrapper<ResultType> from_range(Container const& cnt) {
-    return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(cnt.begin(), cnt.end()));
+template <typename Container>
+auto from_range(Container const& cnt) {
+    using std::begin;
+    using std::end;
+    return from_range( begin( cnt ), end( cnt ) );
 }
 
 
-} // namespace Generators
-} // namespace Catch
+} // namespace Generators
+} // namespace Catch
+
+
+#endif // CATCH_GENERATORS_RANGE_HPP_INCLUDED
+
+#endif // CATCH_GENERATORS_ALL_HPP_INCLUDED
+
+
+/** \file
+ * This is a convenience header for Catch2's interfaces. It includes
+ * **all** of Catch2 headers related to interfaces.
+ *
+ * Generally the Catch2 users should use specific includes they need,
+ * but this header can be used instead for ease-of-experimentation, or
+ * just plain convenience, at the cost of somewhat increased compilation
+ * times.
+ *
+ * When a new header is added to either the `interfaces` folder, or to
+ * the corresponding internal subfolder, it should be added here.
+ */
+
+
+#ifndef CATCH_INTERFACES_ALL_HPP_INCLUDED
+#define CATCH_INTERFACES_ALL_HPP_INCLUDED
+
+
+
+#ifndef CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+#define CATCH_INTERFACES_REPORTER_HPP_INCLUDED
+
+
+
+#ifndef CATCH_TEST_RUN_INFO_HPP_INCLUDED
+#define CATCH_TEST_RUN_INFO_HPP_INCLUDED
+
+
+namespace Catch {
+
+    struct TestRunInfo {
+        constexpr TestRunInfo(StringRef _name) : name(_name) {}
+        StringRef name;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_TEST_RUN_INFO_HPP_INCLUDED
+
+#include <map>
+#include <string>
+#include <vector>
+#include <iosfwd>
+
+namespace Catch {
+
+    struct ReporterDescription;
+    struct ListenerDescription;
+    struct TagInfo;
+    struct TestCaseInfo;
+    class TestCaseHandle;
+    class IConfig;
+    class IStream;
+    enum class ColourMode : std::uint8_t;
+
+    struct ReporterConfig {
+        ReporterConfig( IConfig const* _fullConfig,
+                        Detail::unique_ptr<IStream> _stream,
+                        ColourMode colourMode,
+                        std::map<std::string, std::string> customOptions );
+
+        ReporterConfig( ReporterConfig&& ) = default;
+        ReporterConfig& operator=( ReporterConfig&& ) = default;
+        ~ReporterConfig(); // = default
+
+        Detail::unique_ptr<IStream> takeStream() &&;
+        IConfig const* fullConfig() const;
+        ColourMode colourMode() const;
+        std::map<std::string, std::string> const& customOptions() const;
+
+    private:
+        Detail::unique_ptr<IStream> m_stream;
+        IConfig const* m_fullConfig;
+        ColourMode m_colourMode;
+        std::map<std::string, std::string> m_customOptions;
+    };
+
+    struct AssertionStats {
+        AssertionStats( AssertionResult const& _assertionResult,
+                        std::vector<MessageInfo> const& _infoMessages,
+                        Totals const& _totals );
+
+        AssertionStats( AssertionStats const& )              = default;
+        AssertionStats( AssertionStats && )                  = default;
+        AssertionStats& operator = ( AssertionStats const& ) = delete;
+        AssertionStats& operator = ( AssertionStats && )     = delete;
+
+        AssertionResult assertionResult;
+        std::vector<MessageInfo> infoMessages;
+        Totals totals;
+    };
+
+    struct SectionStats {
+        SectionStats(   SectionInfo&& _sectionInfo,
+                        Counts const& _assertions,
+                        double _durationInSeconds,
+                        bool _missingAssertions );
+
+        SectionInfo sectionInfo;
+        Counts assertions;
+        double durationInSeconds;
+        bool missingAssertions;
+    };
+
+    struct TestCaseStats {
+        TestCaseStats(  TestCaseInfo const& _testInfo,
+                        Totals const& _totals,
+                        std::string&& _stdOut,
+                        std::string&& _stdErr,
+                        bool _aborting );
+
+        TestCaseInfo const * testInfo;
+        Totals totals;
+        std::string stdOut;
+        std::string stdErr;
+        bool aborting;
+    };
+
+    struct TestRunStats {
+        TestRunStats(   TestRunInfo const& _runInfo,
+                        Totals const& _totals,
+                        bool _aborting );
+
+        TestRunInfo runInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+    //! By setting up its preferences, a reporter can modify Catch2's behaviour
+    //! in some regards, e.g. it can request Catch2 to capture writes to
+    //! stdout/stderr during test execution, and pass them to the reporter.
+    struct ReporterPreferences {
+        //! Catch2 should redirect writes to stdout and pass them to the
+        //! reporter
+        bool shouldRedirectStdOut = false;
+        //! Catch2 should call `Reporter::assertionEnded` even for passing
+        //! assertions
+        bool shouldReportAllAssertions = false;
+    };
+
+    /**
+     * The common base for all reporters and event listeners
+     *
+     * Implementing classes must also implement:
+     *
+     *     //! User-friendly description of the reporter/listener type
+     *     static std::string getDescription()
+     *
+     * Generally shouldn't be derived from by users of Catch2 directly,
+     * instead they should derive from one of the utility bases that
+     * derive from this class.
+     */
+    class IEventListener {
+    protected:
+        //! Derived classes can set up their preferences here
+        ReporterPreferences m_preferences;
+        //! The test run's config as filled in from CLI and defaults
+        IConfig const* m_config;
+
+    public:
+        IEventListener( IConfig const* config ): m_config( config ) {}
+
+        virtual ~IEventListener(); // = default;
+
+        // Implementing class must also provide the following static methods:
+        // static std::string getDescription();
+
+        ReporterPreferences const& getPreferences() const {
+            return m_preferences;
+        }
+
+        //! Called when no test cases match provided test spec
+        virtual void noMatchingTestCases( StringRef unmatchedSpec ) = 0;
+        //! Called for all invalid test specs from the cli
+        virtual void reportInvalidTestSpec( StringRef invalidArgument ) = 0;
+
+        /**
+         * Called once in a testing run before tests are started
+         *
+         * Not called if tests won't be run (e.g. only listing will happen)
+         */
+        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+
+        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
+        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
+        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
+        virtual void testCasePartialStarting( TestCaseInfo const& testInfo, uint64_t partNumber ) = 0;
+        //! Called when a `SECTION` is being entered. Not called for skipped sections
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+
+        //! Called when user-code is being probed before the actual benchmark runs
+        virtual void benchmarkPreparing( StringRef benchmarkName ) = 0;
+        //! Called after probe but before the user-code is being benchmarked
+        virtual void benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) = 0;
+        //! Called with the benchmark results if benchmark successfully finishes
+        virtual void benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) = 0;
+        //! Called if running the benchmarks fails for any reason
+        virtual void benchmarkFailed( StringRef benchmarkName ) = 0;
 
+        //! Called before assertion success/failure is evaluated
+        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
 
-#endif // CATCH_GENERATORS_RANGE_HPP_INCLUDED
+        //! Called after assertion was fully evaluated
+        virtual void assertionEnded( AssertionStats const& assertionStats ) = 0;
 
-#endif // CATCH_GENERATORS_ALL_HPP_INCLUDED
+        //! Called after a `SECTION` has finished running
+        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
+        //! Called _every time_ a TEST_CASE is entered, including repeats (due to sections)
+        virtual void testCasePartialEnded(TestCaseStats const& testCaseStats, uint64_t partNumber ) = 0;
+        //! Called _once_ for each TEST_CASE, no matter how many times it is entered
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
+        /**
+         * Called once after all tests in a testing run are finished
+         *
+         * Not called if tests weren't run (e.g. only listings happened)
+         */
+        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
 
+        /**
+         * Called with test cases that are skipped due to the test run aborting.
+         * NOT called for test cases that are explicitly skipped using the `SKIP` macro.
+         *
+         * Deprecated - will be removed in the next major release.
+         */
+        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
 
-/** \file
- * This is a convenience header for Catch2's interfaces. It includes
- * **all** of Catch2 headers related to interfaces.
- *
- * Generally the Catch2 users should use specific includes they need,
- * but this header can be used instead for ease-of-experimentation, or
- * just plain convenience, at the cost of somewhat increased compilation
- * times.
- *
- * When a new header is added to either the `interfaces` folder, or to
- * the corresponding internal subfolder, it should be added here.
- */
+        //! Called if a fatal error (signal/structured exception) occurred
+        virtual void fatalErrorEncountered( StringRef error ) = 0;
 
+        //! Writes out information about provided reporters using reporter-specific format
+        virtual void listReporters(std::vector<ReporterDescription> const& descriptions) = 0;
+        //! Writes out the provided listeners descriptions using reporter-specific format
+        virtual void listListeners(std::vector<ListenerDescription> const& descriptions) = 0;
+        //! Writes out information about provided tests using reporter-specific format
+        virtual void listTests(std::vector<TestCaseHandle> const& tests) = 0;
+        //! Writes out information about the provided tags using reporter-specific format
+        virtual void listTags(std::vector<TagInfo> const& tags) = 0;
+    };
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
 
-#ifndef CATCH_INTERFACES_ALL_HPP_INCLUDED
-#define CATCH_INTERFACES_ALL_HPP_INCLUDED
+} // end namespace Catch
 
+#endif // CATCH_INTERFACES_REPORTER_HPP_INCLUDED
 
 
 #ifndef CATCH_INTERFACES_REPORTER_FACTORY_HPP_INCLUDED
@@ -8337,89 +8830,79 @@ namespace Catch {
 #endif // CATCH_INTERFACES_REPORTER_FACTORY_HPP_INCLUDED
 
 
-#ifndef CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-#define CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
+#ifndef CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+#define CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
 
+#include <string>
 
+namespace Catch {
 
-#ifndef CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
-#define CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
+    struct TagAlias;
 
+    class ITagAliasRegistry {
+    public:
+        virtual ~ITagAliasRegistry(); // = default
+        // Nullptr if not present
+        virtual TagAlias const* find( std::string const& alias ) const = 0;
+        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
 
-namespace Catch {
-    namespace Detail {
-        //! Provides case-insensitive `op<` semantics when called
-        struct CaseInsensitiveLess {
-            bool operator()( StringRef lhs,
-                             StringRef rhs ) const;
-        };
+        static ITagAliasRegistry const& get();
+    };
 
-        //! Provides case-insensitive `op==` semantics when called
-        struct CaseInsensitiveEqualTo {
-            bool operator()( StringRef lhs,
-                             StringRef rhs ) const;
-        };
+} // end namespace Catch
 
-    } // namespace Detail
-} // namespace Catch
+#endif // CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
 
-#endif // CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
-#include <string>
+#ifndef CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+#define CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
+
 #include <vector>
-#include <map>
 
 namespace Catch {
 
+    struct TestCaseInfo;
+    class TestCaseHandle;
     class IConfig;
 
-    class IEventListener;
-    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
-    class IReporterFactory;
-    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
-    struct ReporterConfig;
-    class EventListenerFactory;
-
-    class IReporterRegistry {
+    class ITestCaseRegistry {
     public:
-        using FactoryMap = std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>;
-        using Listeners = std::vector<Detail::unique_ptr<EventListenerFactory>>;
-
-        virtual ~IReporterRegistry(); // = default
-        virtual IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const = 0;
-        virtual FactoryMap const& getFactories() const = 0;
-        virtual Listeners const& getListeners() const = 0;
+        virtual ~ITestCaseRegistry(); // = default
+        // TODO: this exists only for adding filenames to test cases -- let's expose this in a saner way later
+        virtual std::vector<TestCaseInfo* > const& getAllInfos() const = 0;
+        virtual std::vector<TestCaseHandle> const& getAllTests() const = 0;
+        virtual std::vector<TestCaseHandle> const& getAllTestsSorted( IConfig const& config ) const = 0;
     };
 
-} // end namespace Catch
-
-#endif // CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-
+}
 
-#ifndef CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
-#define CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+#endif // CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
 
-#include <string>
+#endif // CATCH_INTERFACES_ALL_HPP_INCLUDED
 
-namespace Catch {
 
-    struct TagAlias;
+#ifndef CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
+#define CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
-    class ITagAliasRegistry {
-    public:
-        virtual ~ITagAliasRegistry(); // = default
-        // Nullptr if not present
-        virtual TagAlias const* find( std::string const& alias ) const = 0;
-        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
 
-        static ITagAliasRegistry const& get();
-    };
+namespace Catch {
+    namespace Detail {
+        //! Provides case-insensitive `op<` semantics when called
+        struct CaseInsensitiveLess {
+            bool operator()( StringRef lhs,
+                             StringRef rhs ) const;
+        };
 
-} // end namespace Catch
+        //! Provides case-insensitive `op==` semantics when called
+        struct CaseInsensitiveEqualTo {
+            bool operator()( StringRef lhs,
+                             StringRef rhs ) const;
+        };
 
-#endif // CATCH_INTERFACES_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+    } // namespace Detail
+} // namespace Catch
 
-#endif // CATCH_INTERFACES_ALL_HPP_INCLUDED
+#endif // CATCH_CASE_INSENSITIVE_COMPARISONS_HPP_INCLUDED
 
 
 
@@ -8446,46 +8929,182 @@ namespace Catch {
 #    define CATCH_CONFIG_ANDROID_LOGWRITE
 #endif
 
-#endif // CATCH_CONFIG_ANDROID_LOGWRITE_HPP_INCLUDED
+#endif // CATCH_CONFIG_ANDROID_LOGWRITE_HPP_INCLUDED
+
+
+
+/** \file
+ * Wrapper for UNCAUGHT_EXCEPTIONS configuration option
+ *
+ * For some functionality, Catch2 requires to know whether there is
+ * an active exception. Because `std::uncaught_exception` is deprecated
+ * in C++17, we want to use `std::uncaught_exceptions` if possible.
+ */
+
+#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+
+
+#if defined(_MSC_VER)
+#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
+#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#  endif
+#endif
+
+
+#include <exception>
+
+#if defined(__cpp_lib_uncaught_exceptions) \
+    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif // __cpp_lib_uncaught_exceptions
+
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif
+
+
+#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+
+
+#ifndef CATCH_CONSOLE_COLOUR_HPP_INCLUDED
+#define CATCH_CONSOLE_COLOUR_HPP_INCLUDED
+
+
+#include <iosfwd>
+#include <cstdint>
+
+namespace Catch {
+
+    enum class ColourMode : std::uint8_t;
+    class IStream;
+
+    struct Colour {
+        enum Code {
+            None = 0,
+
+            White,
+            Red,
+            Green,
+            Blue,
+            Cyan,
+            Yellow,
+            Grey,
+
+            Bright = 0x10,
+
+            BrightRed = Bright | Red,
+            BrightGreen = Bright | Green,
+            LightGrey = Bright | Grey,
+            BrightWhite = Bright | White,
+            BrightYellow = Bright | Yellow,
+
+            // By intention
+            FileName = LightGrey,
+            Warning = BrightYellow,
+            ResultError = BrightRed,
+            ResultSuccess = BrightGreen,
+            ResultExpectedFailure = Warning,
+
+            Error = BrightRed,
+            Success = Green,
+            Skip = LightGrey,
+
+            OriginalExpression = Cyan,
+            ReconstructedExpression = BrightYellow,
+
+            SecondaryText = LightGrey,
+            Headers = White
+        };
+    };
+
+    class ColourImpl {
+    protected:
+        //! The associated stream of this ColourImpl instance
+        IStream* m_stream;
+    public:
+        ColourImpl( IStream* stream ): m_stream( stream ) {}
+
+        //! RAII wrapper around writing specific colour of text using specific
+        //! colour impl into a stream.
+        class ColourGuard {
+            ColourImpl const* m_colourImpl;
+            Colour::Code m_code;
+            bool m_engaged = false;
 
+        public:
+            //! Does **not** engage the guard/start the colour
+            ColourGuard( Colour::Code code,
+                         ColourImpl const* colour );
 
+            ColourGuard( ColourGuard const& rhs ) = delete;
+            ColourGuard& operator=( ColourGuard const& rhs ) = delete;
 
-/** \file
- * Wrapper for UNCAUGHT_EXCEPTIONS configuration option
- *
- * For some functionality, Catch2 requires to know whether there is
- * an active exception. Because `std::uncaught_exception` is deprecated
- * in C++17, we want to use `std::uncaught_exceptions` if possible.
- */
+            ColourGuard( ColourGuard&& rhs ) noexcept;
+            ColourGuard& operator=( ColourGuard&& rhs ) noexcept;
 
-#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
-#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+            //! Removes colour _if_ the guard was engaged
+            ~ColourGuard();
 
-#if defined(_MSC_VER)
-#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
-#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#  endif
-#endif
+            /**
+             * Explicitly engages colour for given stream.
+             *
+             * The API based on operator<< should be preferred.
+             */
+            ColourGuard& engage( std::ostream& stream ) &;
+            /**
+             * Explicitly engages colour for given stream.
+             *
+             * The API based on operator<< should be preferred.
+             */
+            ColourGuard&& engage( std::ostream& stream ) &&;
 
+        private:
+            //! Engages the guard and starts using colour
+            friend std::ostream& operator<<( std::ostream& lhs,
+                                             ColourGuard& guard ) {
+                guard.engageImpl( lhs );
+                return lhs;
+            }
+            //! Engages the guard and starts using colour
+            friend std::ostream& operator<<( std::ostream& lhs,
+                                            ColourGuard&& guard) {
+                guard.engageImpl( lhs );
+                return lhs;
+            }
 
-#include <exception>
+            void engageImpl( std::ostream& stream );
 
-#if defined(__cpp_lib_uncaught_exceptions) \
-    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+        };
 
-#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif // __cpp_lib_uncaught_exceptions
+        virtual ~ColourImpl(); // = default
+        /**
+         * Creates a guard object for given colour and this colour impl
+         *
+         * **Important:**
+         * the guard starts disengaged, and has to be engaged explicitly.
+         */
+        ColourGuard guardColour( Colour::Code colourCode );
 
+    private:
+        virtual void use( Colour::Code colourCode ) const = 0;
+    };
 
-#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
-    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
-    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+    //! Provides ColourImpl based on global config and target compilation platform
+    Detail::unique_ptr<ColourImpl> makeColourImpl( ColourMode colourSelection,
+                                                   IStream* stream );
 
-#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif
+    //! Checks if specific colour impl has been compiled into the binary
+    bool isColourImplAvailable( ColourMode colourSelection );
 
+} // end namespace Catch
 
-#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
+#endif // CATCH_CONSOLE_COLOUR_HPP_INCLUDED
 
 
 #ifndef CATCH_CONSOLE_WIDTH_HPP_INCLUDED
@@ -8751,7 +9370,6 @@ namespace Catch {
         ~ExceptionTranslatorRegistry() override;
         void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator );
         std::string translateActiveException() const override;
-        std::string tryTranslators() const;
 
     private:
         ExceptionTranslators m_translators;
@@ -8764,7 +9382,6 @@ namespace Catch {
 #ifndef CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 #define CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 
-
 #include <cassert>
 
 namespace Catch {
@@ -8827,17 +9444,6 @@ namespace Catch {
 #define CATCH_FLOATING_POINT_HELPERS_HPP_INCLUDED
 
 
-
-#ifndef CATCH_POLYFILLS_HPP_INCLUDED
-#define CATCH_POLYFILLS_HPP_INCLUDED
-
-namespace Catch {
-    bool isnan(float f);
-    bool isnan(double d);
-}
-
-#endif // CATCH_POLYFILLS_HPP_INCLUDED
-
 #include <cassert>
 #include <cmath>
 #include <cstdint>
@@ -8850,6 +9456,11 @@ namespace Catch {
         uint32_t convertToBits(float f);
         uint64_t convertToBits(double d);
 
+        // Used when we know we want == comparison of two doubles
+        // to centralize warning suppression
+        bool directCompare( float lhs, float rhs );
+        bool directCompare( double lhs, double rhs );
+
     } // end namespace Detail
 
 
@@ -9126,6 +9737,119 @@ namespace Catch {
 #endif // CATCH_STREAM_HPP_INCLUDED
 
 
+#ifndef CATCH_JSONWRITER_HPP_INCLUDED
+#define CATCH_JSONWRITER_HPP_INCLUDED
+
+
+#include <cstdint>
+#include <sstream>
+
+namespace Catch {
+    class JsonObjectWriter;
+    class JsonArrayWriter;
+
+    struct JsonUtils {
+        static void indent( std::ostream& os, std::uint64_t level );
+        static void appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level );
+    };
+
+    class JsonValueWriter {
+    public:
+        JsonValueWriter( std::ostream& os );
+        JsonValueWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter writeObject() &&;
+        JsonArrayWriter writeArray() &&;
+
+        template <typename T>
+        void write( T const& value ) && {
+            writeImpl( value, !std::is_arithmetic<T>::value );
+        }
+        void write( StringRef value ) &&;
+        void write( bool value ) &&;
+
+    private:
+        void writeImpl( StringRef value, bool quote );
+
+        // Without this SFINAE, this overload is a better match
+        // for `std::string`, `char const*`, `char const[N]` args.
+        // While it would still work, it would cause code bloat
+        // and multiple iteration over the strings
+        template <typename T,
+                  typename = typename std::enable_if_t<
+                      !std::is_convertible<T, StringRef>::value>>
+        void writeImpl( T const& value, bool quote_value ) {
+            m_sstream << value;
+            writeImpl( m_sstream.str(), quote_value );
+        }
+
+        std::ostream& m_os;
+        std::stringstream m_sstream;
+        std::uint64_t m_indent_level;
+    };
+
+    class JsonObjectWriter {
+    public:
+        JsonObjectWriter( std::ostream& os );
+        JsonObjectWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter( JsonObjectWriter&& source );
+        JsonObjectWriter& operator=( JsonObjectWriter&& source ) = delete;
+
+        ~JsonObjectWriter();
+
+        JsonValueWriter write( StringRef key );
+
+    private:
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+    class JsonArrayWriter {
+    public:
+        JsonArrayWriter( std::ostream& os );
+        JsonArrayWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonArrayWriter( JsonArrayWriter&& source );
+        JsonArrayWriter& operator=( JsonArrayWriter&& source ) = delete;
+
+        ~JsonArrayWriter();
+
+        JsonObjectWriter writeObject();
+        JsonArrayWriter writeArray();
+
+        template <typename T>
+        JsonArrayWriter& write( T const& value ) {
+            return writeImpl( value );
+        }
+
+        JsonArrayWriter& write( bool value );
+
+    private:
+        template <typename T>
+        JsonArrayWriter& writeImpl( T const& value ) {
+            JsonUtils::appendCommaNewline(
+                m_os, m_should_comma, m_indent_level + 1 );
+            JsonValueWriter{ m_os }.write( value );
+
+            return *this;
+        }
+
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_JSONWRITER_HPP_INCLUDED
+
+
 #ifndef CATCH_LEAK_DETECTOR_HPP_INCLUDED
 #define CATCH_LEAK_DETECTOR_HPP_INCLUDED
 
@@ -9312,28 +10036,45 @@ namespace Catch {
 
 
 #include <map>
+#include <string>
+#include <vector>
 
 namespace Catch {
 
-    class ReporterRegistry : public IReporterRegistry {
-    public:
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
+    class IReporterFactory;
+    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
+    struct ReporterConfig;
+    class EventListenerFactory;
+
+    class ReporterRegistry {
+        struct ReporterRegistryImpl;
+        Detail::unique_ptr<ReporterRegistryImpl> m_impl;
 
+    public:
         ReporterRegistry();
-        ~ReporterRegistry() override; // = default, out of line to allow fwd decl
+        ~ReporterRegistry(); // = default;
 
-        IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const override;
+        IEventListenerPtr create( std::string const& name,
+                                  ReporterConfig&& config ) const;
 
-        void registerReporter( std::string const& name, IReporterFactoryPtr factory );
-        void registerListener( Detail::unique_ptr<EventListenerFactory> factory );
+        void registerReporter( std::string const& name,
+                               IReporterFactoryPtr factory );
 
-        FactoryMap const& getFactories() const override;
-        Listeners const& getListeners() const override;
+        void
+        registerListener( Detail::unique_ptr<EventListenerFactory> factory );
 
-    private:
-        FactoryMap m_factories;
-        Listeners m_listeners;
+        std::map<std::string,
+                 IReporterFactoryPtr,
+                 Detail::CaseInsensitiveLess> const&
+        getFactories() const;
+
+        std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+        getListeners() const;
     };
-}
+
+} // end namespace Catch
 
 #endif // CATCH_REPORTER_REGISTRY_HPP_INCLUDED
 
@@ -9448,7 +10189,7 @@ namespace TestCaseTracking {
 
         //! Returns true if tracker run to completion (successfully or not)
         virtual bool isComplete() const = 0;
-        //! Returns true if tracker run to completion succesfully
+        //! Returns true if tracker run to completion successfully
         bool isSuccessfullyCompleted() const {
             return m_runState == CompletedSuccessfully;
         }
@@ -9582,13 +10323,14 @@ using TestCaseTracking::SectionTracker;
 
 namespace Catch {
 
-    class IMutableContext;
     class IGeneratorTracker;
     class IConfig;
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class RunContext : public IResultCapture {
+    class RunContext final : public IResultCapture {
 
     public:
         RunContext( RunContext const& ) = delete;
@@ -9617,7 +10359,7 @@ namespace Catch {
                     AssertionReaction& reaction ) override;
         void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) override;
         void handleIncomplete
                 (   AssertionInfo const& info ) override;
@@ -9626,6 +10368,7 @@ namespace Catch {
                     ResultWas::OfType resultType,
                     AssertionReaction &reaction ) override;
 
+        void notifyAssertionStarted( AssertionInfo const& info ) override;
         bool sectionStarted( StringRef sectionName,
                              SourceLineInfo const& sectionLineInfo,
                              Counts& assertions ) override;
@@ -9676,7 +10419,7 @@ namespace Catch {
         void resetAssertionInfo();
         bool testForMissingAssertions( Counts& assertions );
 
-        void assertionEnded( AssertionResult const& result );
+        void assertionEnded( AssertionResult&& result );
         void reportExpr
                 (   AssertionInfo const &info,
                     ResultWas::OfType resultType,
@@ -9690,7 +10433,6 @@ namespace Catch {
         void handleUnfinishedSections();
 
         TestRunInfo m_runInfo;
-        IMutableContext& m_context;
         TestCaseHandle const* m_activeTestCase = nullptr;
         ITracker* m_testCaseTracker = nullptr;
         Optional<AssertionResult> m_lastResult;
@@ -9720,7 +10462,7 @@ namespace Catch {
 #ifndef CATCH_SHARDING_HPP_INCLUDED
 #define CATCH_SHARDING_HPP_INCLUDED
 
-
+#include <cassert>
 #include <cmath>
 #include <algorithm>
 
@@ -9947,24 +10689,20 @@ namespace Catch {
 
 namespace Catch {
 
-    class TestCaseHandle;
     class IConfig;
+    class ITestInvoker;
+    class TestCaseHandle;
     class TestSpec;
 
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases );
 
     bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-
-    void enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& functions );
 
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
     std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
 
     class TestRegistry : public ITestCaseRegistry {
     public:
-        ~TestRegistry() override = default;
-
         void registerTest( Detail::unique_ptr<TestCaseInfo> testInfo, Detail::unique_ptr<ITestInvoker> testInvoker );
 
         std::vector<TestCaseInfo*> const& getAllInfos() const override;
@@ -9985,18 +10723,6 @@ namespace Catch {
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class TestInvokerAsFunction final : public ITestInvoker {
-        using TestType = void(*)();
-        TestType m_testAsFunction;
-    public:
-        TestInvokerAsFunction(TestType testAsFunction) noexcept:
-            m_testAsFunction(testAsFunction) {}
-
-        void invoke() const override;
-    };
-
-    ///////////////////////////////////////////////////////////////////////////
-
 
 } // end namespace Catch
 
@@ -10082,6 +10808,7 @@ namespace Catch {
 #ifndef CATCH_TEXTFLOW_HPP_INCLUDED
 #define CATCH_TEXTFLOW_HPP_INCLUDED
 
+
 #include <cassert>
 #include <string>
 #include <vector>
@@ -10110,7 +10837,7 @@ namespace Catch {
 
         public:
             /**
-             * Iterates "lines" in `Column` and return sthem
+             * Iterates "lines" in `Column` and returns them
              */
             class const_iterator {
                 friend Column;
@@ -10132,7 +10859,7 @@ namespace Catch {
                 // Calculates the length of the current line
                 void calcLength();
 
-                // Returns current indention width
+                // Returns current indentation width
                 size_t indentSize() const;
 
                 // Creates an indented and (optionally) suffixed string from
@@ -10164,20 +10891,35 @@ namespace Catch {
             using iterator = const_iterator;
 
             explicit Column( std::string const& text ): m_string( text ) {}
+            explicit Column( std::string&& text ):
+                m_string( CATCH_MOVE(text)) {}
 
-            Column& width( size_t newWidth ) {
+            Column& width( size_t newWidth ) & {
                 assert( newWidth > 0 );
                 m_width = newWidth;
                 return *this;
             }
-            Column& indent( size_t newIndent ) {
+            Column&& width( size_t newWidth ) && {
+                assert( newWidth > 0 );
+                m_width = newWidth;
+                return CATCH_MOVE( *this );
+            }
+            Column& indent( size_t newIndent ) & {
                 m_indent = newIndent;
                 return *this;
             }
-            Column& initialIndent( size_t newIndent ) {
+            Column&& indent( size_t newIndent ) && {
+                m_indent = newIndent;
+                return CATCH_MOVE( *this );
+            }
+            Column& initialIndent( size_t newIndent ) & {
                 m_initialIndent = newIndent;
                 return *this;
             }
+            Column&& initialIndent( size_t newIndent ) && {
+                m_initialIndent = newIndent;
+                return CATCH_MOVE( *this );
+            }
 
             size_t width() const { return m_width; }
             const_iterator begin() const { return const_iterator( *this ); }
@@ -10186,7 +10928,8 @@ namespace Catch {
             friend std::ostream& operator<<( std::ostream& os,
                                              Column const& col );
 
-            Columns operator+( Column const& other );
+            friend Columns operator+( Column const& lhs, Column const& rhs );
+            friend Columns operator+( Column&& lhs, Column&& rhs );
         };
 
         //! Creates a column that serves as an empty space of specific width
@@ -10230,8 +10973,10 @@ namespace Catch {
             iterator begin() const { return iterator( *this ); }
             iterator end() const { return { *this, iterator::EndTag() }; }
 
-            Columns& operator+=( Column const& col );
-            Columns operator+( Column const& col );
+            friend Columns& operator+=( Columns& lhs, Column const& rhs );
+            friend Columns& operator+=( Columns& lhs, Column&& rhs );
+            friend Columns operator+( Columns const& lhs, Column const& rhs );
+            friend Columns operator+( Columns&& lhs, Column&& rhs );
 
             friend std::ostream& operator<<( std::ostream& os,
                                              Columns const& cols );
@@ -10445,6 +11190,8 @@ namespace Catch {
 #define CATCH_MATCHERS_IMPL_HPP_INCLUDED
 
 
+#include <string>
+
 namespace Catch {
 
     template<typename ArgT, typename MatcherT>
@@ -11680,7 +12427,7 @@ namespace Catch {
 
         /**
          * Creates a matcher that checks if all elements in a range are equal
-         * to all elements in another range, in some permuation.
+         * to all elements in another range, in some permutation.
          *
          * Uses to provided predicate `predicate` to do the comparisons
          */
@@ -11850,11 +12597,10 @@ namespace Matchers {
             // - a more general approach would be via a compare template that defaults
             // to using !=. but could be specialised for, e.g. std::vector<T> etc
             // - then just call that directly
-            if (m_comparator.size() != v.size())
-                return false;
-            for (std::size_t i = 0; i < v.size(); ++i)
-                if (m_comparator[i] != v[i])
-                    return false;
+            if ( m_comparator.size() != v.size() ) { return false; }
+            for ( std::size_t i = 0; i < v.size(); ++i ) {
+                if ( !( m_comparator[i] == v[i] ) ) { return false; }
+            }
             return true;
         }
         std::string describe() const override {
@@ -12358,7 +13104,7 @@ namespace Catch {
         void skipTest(TestCaseInfo const&) override {}
 
     protected:
-        //! Should the cumulative base store the assertion expansion for succesful assertions?
+        //! Should the cumulative base store the assertion expansion for successful assertions?
         bool m_shouldStoreSuccesfulAssertions = true;
         //! Should the cumulative base store the assertion expansion for failed assertions?
         bool m_shouldStoreFailedAssertions = true;
@@ -12526,6 +13272,93 @@ namespace Catch {
 #endif // CATCH_REPORTER_HELPERS_HPP_INCLUDED
 
 
+
+#ifndef CATCH_REPORTER_JSON_HPP_INCLUDED
+#define CATCH_REPORTER_JSON_HPP_INCLUDED
+
+
+#include <stack>
+
+namespace Catch {
+    class JsonReporter : public StreamingReporterBase {
+    public:
+        JsonReporter( ReporterConfig&& config );
+
+        ~JsonReporter() override;
+
+        static std::string getDescription();
+
+    public: // StreamingReporterBase
+        void testRunStarting( TestRunInfo const& runInfo ) override;
+        void testRunEnded( TestRunStats const& runStats ) override;
+
+        void testCaseStarting( TestCaseInfo const& tcInfo ) override;
+        void testCaseEnded( TestCaseStats const& tcStats ) override;
+
+        void testCasePartialStarting( TestCaseInfo const& tcInfo,
+                                      uint64_t index ) override;
+        void testCasePartialEnded( TestCaseStats const& tcStats,
+                                   uint64_t index ) override;
+
+        void sectionStarting( SectionInfo const& sectionInfo ) override;
+        void sectionEnded( SectionStats const& sectionStats ) override;
+
+        void assertionStarting( AssertionInfo const& assertionInfo ) override;
+        void assertionEnded( AssertionStats const& assertionStats ) override;
+
+        //void testRunEndedCumulative() override;
+
+        void benchmarkPreparing( StringRef name ) override;
+        void benchmarkStarting( BenchmarkInfo const& ) override;
+        void benchmarkEnded( BenchmarkStats<> const& ) override;
+        void benchmarkFailed( StringRef error ) override;
+
+        void listReporters(
+            std::vector<ReporterDescription> const& descriptions ) override;
+        void listListeners(
+            std::vector<ListenerDescription> const& descriptions ) override;
+        void listTests( std::vector<TestCaseHandle> const& tests ) override;
+        void listTags( std::vector<TagInfo> const& tags ) override;
+
+    private:
+        Timer m_testCaseTimer;
+        enum class Writer {
+            Object,
+            Array
+        };
+
+        JsonArrayWriter& startArray();
+        JsonArrayWriter& startArray( StringRef key );
+
+        JsonObjectWriter& startObject();
+        JsonObjectWriter& startObject( StringRef key );
+
+        void endObject();
+        void endArray();
+
+        bool isInside( Writer writer );
+
+        void startListing();
+        void endListing();
+
+        // Invariant:
+        // When m_writers is not empty and its top element is
+        // - Writer::Object, then m_objectWriters is not be empty
+        // - Writer::Array,  then m_arrayWriters shall not be empty
+        std::stack<JsonObjectWriter> m_objectWriters{};
+        std::stack<JsonArrayWriter> m_arrayWriters{};
+        std::stack<Writer> m_writers{};
+
+        bool m_startedListing = false;
+
+        // std::size_t m_sectionDepth = 0;
+        // std::size_t m_sectionStarted = 0;
+    };
+} // namespace Catch
+
+#endif // CATCH_REPORTER_JSON_HPP_INCLUDED
+
+
 #ifndef CATCH_REPORTER_JUNIT_HPP_INCLUDED
 #define CATCH_REPORTER_JUNIT_HPP_INCLUDED
 
@@ -12537,8 +13370,6 @@ namespace Catch {
     public:
         JunitReporter(ReporterConfig&& _config);
 
-        ~JunitReporter() override = default;
-
         static std::string getDescription();
 
         void testRunStarting(TestRunInfo const& runInfo) override;
@@ -12665,7 +13496,8 @@ namespace Catch {
         //! independent on the reporter's concrete type
         void registerReporterImpl( std::string const& name,
                                    IReporterFactoryPtr reporterPtr );
-
+        //! Actually registers the factory, independent on listener's concrete type
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory );
     } // namespace Detail
 
     class IEventListener;
@@ -12726,7 +13558,7 @@ namespace Catch {
 
     public:
         ListenerRegistrar(StringRef listenerName) {
-            getMutableRegistryHub().registerListener( Detail::make_unique<TypedListenerFactory>(listenerName) );
+            registerListenerImpl( Detail::make_unique<TypedListenerFactory>(listenerName) );
         }
     };
 }
@@ -12778,8 +13610,6 @@ namespace Catch {
             m_shouldStoreSuccesfulAssertions = false;
         }
 
-        ~SonarQubeReporter() override = default;
-
         static std::string getDescription() {
             using namespace std::string_literals;
             return "Reports test results in the Generic Test Data SonarQube XML format"s;
@@ -12826,7 +13656,6 @@ namespace Catch {
             StreamingReporterBase( CATCH_MOVE(config) ) {
             m_preferences.shouldReportAllAssertions = true;
         }
-        ~TAPReporter() override = default;
 
         static std::string getDescription() {
             using namespace std::string_literals;
diff --git a/alpaka/thirdParty/catch2/fuzzing/NullOStream.cpp b/alpaka/thirdParty/catch2/fuzzing/NullOStream.cpp
index 53e0893d..e3a181e8 100644
--- a/alpaka/thirdParty/catch2/fuzzing/NullOStream.cpp
+++ b/alpaka/thirdParty/catch2/fuzzing/NullOStream.cpp
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 #include "NullOStream.h"
 
 void NullOStream::avoidOutOfLineVirtualCompilerWarning()
diff --git a/alpaka/thirdParty/catch2/fuzzing/NullOStream.h b/alpaka/thirdParty/catch2/fuzzing/NullOStream.h
index e1fe15b0..abbec09c 100644
--- a/alpaka/thirdParty/catch2/fuzzing/NullOStream.h
+++ b/alpaka/thirdParty/catch2/fuzzing/NullOStream.h
@@ -1,3 +1,11 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
 #pragma once
 
 #include <ostream>
diff --git a/alpaka/thirdParty/catch2/fuzzing/fuzz_TestSpecParser.cpp b/alpaka/thirdParty/catch2/fuzzing/fuzz_TestSpecParser.cpp
index af4de406..3aba8c84 100644
--- a/alpaka/thirdParty/catch2/fuzzing/fuzz_TestSpecParser.cpp
+++ b/alpaka/thirdParty/catch2/fuzzing/fuzz_TestSpecParser.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_test_spec_parser.hpp>
diff --git a/alpaka/thirdParty/catch2/fuzzing/fuzz_XmlWriter.cpp b/alpaka/thirdParty/catch2/fuzzing/fuzz_XmlWriter.cpp
index f8e5a0d9..70c4ed80 100644
--- a/alpaka/thirdParty/catch2/fuzzing/fuzz_XmlWriter.cpp
+++ b/alpaka/thirdParty/catch2/fuzzing/fuzz_XmlWriter.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_xmlwriter.hpp>
diff --git a/alpaka/thirdParty/catch2/fuzzing/fuzz_textflow.cpp b/alpaka/thirdParty/catch2/fuzzing/fuzz_textflow.cpp
index eafe79fe..7000f420 100644
--- a/alpaka/thirdParty/catch2/fuzzing/fuzz_textflow.cpp
+++ b/alpaka/thirdParty/catch2/fuzzing/fuzz_textflow.cpp
@@ -1,4 +1,10 @@
-//License: Boost 1.0
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
 //By Paul Dreik 2020
 
 #include <catch2/internal/catch_textflow.hpp>
diff --git a/alpaka/thirdParty/catch2/meson.build b/alpaka/thirdParty/catch2/meson.build
index 1faca35f..0a897520 100644
--- a/alpaka/thirdParty/catch2/meson.build
+++ b/alpaka/thirdParty/catch2/meson.build
@@ -8,10 +8,12 @@
 project(
   'catch2',
   'cpp',
-  version: '3.3.2', # CML version placeholder, don't delete
+  version: '3.5.2', # CML version placeholder, don't delete
   license: 'BSL-1.0',
-  meson_version: '>=0.50.0',
+  meson_version: '>=0.54.1',
 )
 
 subdir('src/catch2')
-subdir('tests')
+if get_option('tests')
+  subdir('tests')
+endif
diff --git a/alpaka/thirdParty/catch2/src/CMakeLists.txt b/alpaka/thirdParty/catch2/src/CMakeLists.txt
index fd05dbdd..eb805ddd 100644
--- a/alpaka/thirdParty/catch2/src/CMakeLists.txt
+++ b/alpaka/thirdParty/catch2/src/CMakeLists.txt
@@ -21,6 +21,8 @@ set(BENCHMARK_HEADERS
   ${SOURCES_DIR}/benchmark/catch_sample_analysis.hpp
   ${SOURCES_DIR}/benchmark/detail/catch_analyse.hpp
   ${SOURCES_DIR}/benchmark/detail/catch_benchmark_function.hpp
+  ${SOURCES_DIR}/benchmark/detail/catch_benchmark_stats.hpp
+  ${SOURCES_DIR}/benchmark/detail/catch_benchmark_stats_fwd.hpp
   ${SOURCES_DIR}/benchmark/detail/catch_complete_invoke.hpp
   ${SOURCES_DIR}/benchmark/detail/catch_estimate_clock.hpp
   ${SOURCES_DIR}/benchmark/detail/catch_measure.hpp
@@ -31,6 +33,7 @@ set(BENCHMARK_HEADERS
 )
 set(BENCHMARK_SOURCES
   ${SOURCES_DIR}/benchmark/catch_chronometer.cpp
+  ${SOURCES_DIR}/benchmark/detail/catch_analyse.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_benchmark_function.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_run_for_at_least.cpp
   ${SOURCES_DIR}/benchmark/detail/catch_stats.cpp
@@ -71,6 +74,7 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_compiler_capabilities.hpp
   ${SOURCES_DIR}/internal/catch_config_android_logwrite.hpp
   ${SOURCES_DIR}/internal/catch_config_counter.hpp
+  ${SOURCES_DIR}/internal/catch_config_static_analysis_support.hpp
   ${SOURCES_DIR}/internal/catch_config_uncaught_exceptions.hpp
   ${SOURCES_DIR}/internal/catch_config_wchar.hpp
   ${SOURCES_DIR}/internal/catch_console_colour.hpp
@@ -89,6 +93,7 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_getenv.hpp
   ${SOURCES_DIR}/internal/catch_istream.hpp
   ${SOURCES_DIR}/internal/catch_is_permutation.hpp
+  ${SOURCES_DIR}/internal/catch_jsonwriter.hpp
   ${SOURCES_DIR}/internal/catch_lazy_expr.hpp
   ${SOURCES_DIR}/internal/catch_leak_detector.hpp
   ${SOURCES_DIR}/internal/catch_list.hpp
@@ -104,6 +109,8 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_polyfills.hpp
   ${SOURCES_DIR}/internal/catch_preprocessor.hpp
   ${SOURCES_DIR}/internal/catch_preprocessor_remove_parens.hpp
+  ${SOURCES_DIR}/internal/catch_random_floating_point_helpers.hpp
+  ${SOURCES_DIR}/internal/catch_random_integer_helpers.hpp
   ${SOURCES_DIR}/internal/catch_random_number_generator.hpp
   ${SOURCES_DIR}/internal/catch_random_seed_generation.hpp
   ${SOURCES_DIR}/internal/catch_reporter_registry.hpp
@@ -128,10 +135,13 @@ set(IMPL_HEADERS
   ${SOURCES_DIR}/internal/catch_test_failure_exception.hpp
   ${SOURCES_DIR}/internal/catch_test_macro_impl.hpp
   ${SOURCES_DIR}/internal/catch_test_registry.hpp
+  ${SOURCES_DIR}/internal/catch_test_run_info.hpp
   ${SOURCES_DIR}/internal/catch_test_spec_parser.hpp
   ${SOURCES_DIR}/internal/catch_textflow.hpp
   ${SOURCES_DIR}/internal/catch_to_string.hpp
   ${SOURCES_DIR}/internal/catch_uncaught_exceptions.hpp
+  ${SOURCES_DIR}/internal/catch_uniform_floating_point_distribution.hpp
+  ${SOURCES_DIR}/internal/catch_uniform_integer_distribution.hpp
   ${SOURCES_DIR}/internal/catch_unique_name.hpp
   ${SOURCES_DIR}/internal/catch_unique_ptr.hpp
   ${SOURCES_DIR}/internal/catch_void_type.hpp
@@ -153,6 +163,7 @@ set(IMPL_SOURCES
   ${SOURCES_DIR}/catch_timer.cpp
   ${SOURCES_DIR}/catch_tostring.cpp
   ${SOURCES_DIR}/catch_totals.cpp
+  ${SOURCES_DIR}/catch_translate_exception.cpp
   ${SOURCES_DIR}/catch_version.cpp
   ${SOURCES_DIR}/internal/catch_assertion_handler.cpp
   ${SOURCES_DIR}/internal/catch_case_insensitive_comparisons.cpp
@@ -171,6 +182,7 @@ set(IMPL_SOURCES
   ${SOURCES_DIR}/internal/catch_floating_point_helpers.cpp
   ${SOURCES_DIR}/internal/catch_getenv.cpp
   ${SOURCES_DIR}/internal/catch_istream.cpp
+  ${SOURCES_DIR}/internal/catch_jsonwriter.cpp
   ${SOURCES_DIR}/internal/catch_lazy_expr.cpp
   ${SOURCES_DIR}/internal/catch_leak_detector.cpp
   ${SOURCES_DIR}/internal/catch_list.cpp
@@ -216,8 +228,8 @@ set(INTERFACE_HEADERS
   ${SOURCES_DIR}/interfaces/catch_interfaces_registry_hub.hpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_reporter.hpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_reporter_factory.hpp
-  ${SOURCES_DIR}/interfaces/catch_interfaces_reporter_registry.hpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_tag_alias_registry.hpp
+  ${SOURCES_DIR}/interfaces/catch_interfaces_test_invoker.hpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_testcase.hpp
 )
 set(INTERFACE_SOURCES
@@ -228,7 +240,6 @@ set(INTERFACE_SOURCES
   ${SOURCES_DIR}/interfaces/catch_interfaces_registry_hub.cpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_reporter.cpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_reporter_factory.cpp
-  ${SOURCES_DIR}/interfaces/catch_interfaces_reporter_registry.cpp
   ${SOURCES_DIR}/interfaces/catch_interfaces_testcase.cpp
 )
 set(INTERFACE_FILES ${INTERFACE_HEADERS} ${INTERFACE_SOURCES})
@@ -284,6 +295,7 @@ set(REPORTER_HEADERS
   ${SOURCES_DIR}/reporters/catch_reporter_cumulative_base.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_event_listener.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_helpers.hpp
+  ${SOURCES_DIR}/reporters/catch_reporter_json.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_junit.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_multi.hpp
   ${SOURCES_DIR}/reporters/catch_reporter_registrars.hpp
@@ -302,6 +314,7 @@ set(REPORTER_SOURCES
   ${SOURCES_DIR}/reporters/catch_reporter_cumulative_base.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_event_listener.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_helpers.cpp
+  ${SOURCES_DIR}/reporters/catch_reporter_json.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_junit.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_multi.cpp
   ${SOURCES_DIR}/reporters/catch_reporter_registrars.cpp
@@ -335,7 +348,9 @@ source_group("generated headers"
 )
 
 add_library(Catch2 ${ALL_FILES})
-add_build_reproducibility_settings(Catch2)
+if (CATCH_ENABLE_REPRODUCIBLE_BUILD)
+    add_build_reproducibility_settings(Catch2)
+endif()
 add_library(Catch2::Catch2 ALIAS Catch2)
 
 if (ANDROID)
@@ -388,7 +403,9 @@ target_include_directories(Catch2
 add_library(Catch2WithMain
     ${SOURCES_DIR}/internal/catch_main.cpp
 )
-add_build_reproducibility_settings(Catch2WithMain)
+if (CATCH_ENABLE_REPRODUCIBLE_BUILD)
+    add_build_reproducibility_settings(Catch2WithMain)
+endif()
 add_library(Catch2::Catch2WithMain ALIAS Catch2WithMain)
 target_link_libraries(Catch2WithMain PUBLIC Catch2)
 set_target_properties(Catch2WithMain
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark.hpp
index 1cf10be6..3db40bb0 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark.hpp
@@ -10,26 +10,28 @@
 #ifndef CATCH_BENCHMARK_HPP_INCLUDED
 #define CATCH_BENCHMARK_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/catch_user_config.hpp>
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_context.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
-#include <catch2/internal/catch_unique_name.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
-#include <catch2/benchmark/catch_chronometer.hpp>
+#include <catch2/internal/catch_test_failure_exception.hpp>
+#include <catch2/internal/catch_unique_name.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
+#include <catch2/benchmark/detail/catch_benchmark_stats.hpp>
 #include <catch2/benchmark/catch_clock.hpp>
 #include <catch2/benchmark/catch_environment.hpp>
 #include <catch2/benchmark/catch_execution_plan.hpp>
 #include <catch2/benchmark/detail/catch_estimate_clock.hpp>
-#include <catch2/benchmark/detail/catch_complete_invoke.hpp>
 #include <catch2/benchmark/detail/catch_analyse.hpp>
 #include <catch2/benchmark/detail/catch_benchmark_function.hpp>
 #include <catch2/benchmark/detail/catch_run_for_at_least.hpp>
 
 #include <algorithm>
-#include <functional>
+#include <chrono>
+#include <exception>
 #include <string>
-#include <vector>
 #include <cmath>
 
 namespace Catch {
@@ -43,16 +45,18 @@ namespace Catch {
                 : fun(CATCH_MOVE(func)), name(CATCH_MOVE(benchmarkName)) {}
 
             template <typename Clock>
-            ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            ExecutionPlan prepare(const IConfig &cfg, Environment env) const {
                 auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
                 auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
-                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
+                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<IDuration>(run_time), 1, fun);
                 int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
-                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FDuration>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
             }
 
             template <typename Clock = default_clock>
             void run() {
+                static_assert( Clock::is_steady,
+                               "Benchmarking clock should be steady" );
                 auto const* cfg = getCurrentContext().getConfig();
 
                 auto env = Detail::measure_environment<Clock>();
@@ -79,10 +83,10 @@ namespace Catch {
                         return plan.template run<Clock>(*cfg, env);
                     });
 
-                    auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end());
-                    BenchmarkStats<FloatDuration<Clock>> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
+                    auto analysis = Detail::analyse(*cfg, samples.data(), samples.data() + samples.size());
+                    BenchmarkStats<> stats{ CATCH_MOVE(info), CATCH_MOVE(analysis.samples), analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
                     getResultCapture().benchmarkEnded(stats);
-                } CATCH_CATCH_ANON (TestFailureException) {
+                } CATCH_CATCH_ANON (TestFailureException const&) {
                     getResultCapture().benchmarkFailed("Benchmark failed due to failed assertion"_sr);
                 } CATCH_CATCH_ALL{
                     getResultCapture().benchmarkFailed(translateActiveException());
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark_all.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark_all.hpp
index eb81f238..56fc7c74 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark_all.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_benchmark_all.hpp
@@ -33,6 +33,8 @@
 #include <catch2/benchmark/catch_sample_analysis.hpp>
 #include <catch2/benchmark/detail/catch_analyse.hpp>
 #include <catch2/benchmark/detail/catch_benchmark_function.hpp>
+#include <catch2/benchmark/detail/catch_benchmark_stats.hpp>
+#include <catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp>
 #include <catch2/benchmark/detail/catch_complete_invoke.hpp>
 #include <catch2/benchmark/detail/catch_estimate_clock.hpp>
 #include <catch2/benchmark/detail/catch_measure.hpp>
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_chronometer.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_chronometer.hpp
index bce2406b..95498e6b 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_chronometer.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_chronometer.hpp
@@ -12,7 +12,6 @@
 
 #include <catch2/benchmark/catch_clock.hpp>
 #include <catch2/benchmark/catch_optimizer.hpp>
-#include <catch2/benchmark/detail/catch_complete_invoke.hpp>
 #include <catch2/internal/catch_meta.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
@@ -33,7 +32,10 @@ namespace Catch {
                 void start() override { started = Clock::now(); }
                 void finish() override { finished = Clock::now(); }
 
-                ClockDuration<Clock> elapsed() const { return finished - started; }
+                IDuration elapsed() const {
+                    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        finished - started );
+                }
 
                 TimePoint<Clock> started;
                 TimePoint<Clock> finished;
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_clock.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_clock.hpp
index cee46097..4068c4d2 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_clock.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_clock.hpp
@@ -11,28 +11,16 @@
 #define CATCH_CLOCK_HPP_INCLUDED
 
 #include <chrono>
-#include <ratio>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Clock>
-        using ClockDuration = typename Clock::duration;
-        template <typename Clock>
-        using FloatDuration = std::chrono::duration<double, typename Clock::period>;
+        using IDuration = std::chrono::nanoseconds;
+        using FDuration = std::chrono::duration<double, std::nano>;
 
         template <typename Clock>
         using TimePoint = typename Clock::time_point;
 
         using default_clock = std::chrono::steady_clock;
-
-        template <typename Clock>
-        struct now {
-            TimePoint<Clock> operator()() const {
-                return Clock::now();
-            }
-        };
-
-        using fp_seconds = std::chrono::duration<double, std::ratio<1>>;
     } // namespace Benchmark
 } // namespace Catch
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_environment.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_environment.hpp
index de4d77df..da3f2fa9 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_environment.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_environment.hpp
@@ -15,21 +15,13 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct EnvironmentEstimate {
-            Duration mean;
+            FDuration mean;
             OutlierClassification outliers;
-
-            template <typename Duration2>
-            operator EnvironmentEstimate<Duration2>() const {
-                return { mean, outliers };
-            }
         };
-        template <typename Clock>
         struct Environment {
-            using clock_type = Clock;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;
-            EnvironmentEstimate<FloatDuration<Clock>> clock_cost;
+            EnvironmentEstimate clock_resolution;
+            EnvironmentEstimate clock_cost;
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_estimate.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_estimate.hpp
index be594a18..64383a2e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_estimate.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_estimate.hpp
@@ -12,17 +12,12 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
+        template <typename Type>
         struct Estimate {
-            Duration point;
-            Duration lower_bound;
-            Duration upper_bound;
+            Type point;
+            Type lower_bound;
+            Type upper_bound;
             double confidence_interval;
-
-            template <typename Duration2>
-            operator Estimate<Duration2>() const {
-                return { point, lower_bound, upper_bound, confidence_interval };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_execution_plan.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_execution_plan.hpp
index 039de7ee..17ca589f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_execution_plan.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_execution_plan.hpp
@@ -17,38 +17,38 @@
 #include <catch2/benchmark/detail/catch_repeat.hpp>
 #include <catch2/benchmark/detail/catch_run_for_at_least.hpp>
 
-#include <algorithm>
-#include <iterator>
+#include <vector>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct ExecutionPlan {
             int iterations_per_sample;
-            Duration estimated_duration;
+            FDuration estimated_duration;
             Detail::BenchmarkFunction benchmark;
-            Duration warmup_time;
+            FDuration warmup_time;
             int warmup_iterations;
 
-            template <typename Duration2>
-            operator ExecutionPlan<Duration2>() const {
-                return { iterations_per_sample, estimated_duration, benchmark, warmup_time, warmup_iterations };
-            }
-
             template <typename Clock>
-            std::vector<FloatDuration<Clock>> run(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+            std::vector<FDuration> run(const IConfig &cfg, Environment env) const {
                 // warmup a bit
-                Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_iterations, Detail::repeat(now<Clock>{}));
-
-                std::vector<FloatDuration<Clock>> times;
-                times.reserve(cfg.benchmarkSamples());
-                std::generate_n(std::back_inserter(times), cfg.benchmarkSamples(), [this, env] {
+                Detail::run_for_at_least<Clock>(
+                    std::chrono::duration_cast<IDuration>( warmup_time ),
+                    warmup_iterations,
+                    Detail::repeat( []() { return Clock::now(); } )
+                );
+
+                std::vector<FDuration> times;
+                const auto num_samples = cfg.benchmarkSamples();
+                times.reserve( num_samples );
+                for ( size_t i = 0; i < num_samples; ++i ) {
                     Detail::ChronometerModel<Clock> model;
-                    this->benchmark(Chronometer(model, iterations_per_sample));
+                    this->benchmark( Chronometer( model, iterations_per_sample ) );
                     auto sample_time = model.elapsed() - env.clock_cost.mean;
-                    if (sample_time < FloatDuration<Clock>::zero()) sample_time = FloatDuration<Clock>::zero();
-                    return sample_time / iterations_per_sample;
-                });
+                    if ( sample_time < FDuration::zero() ) {
+                        sample_time = FDuration::zero();
+                    }
+                    times.push_back(sample_time / iterations_per_sample);
+                }
                 return times;
             }
         };
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_optimizer.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_optimizer.hpp
index 0dbfc145..61e6571f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_optimizer.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_optimizer.hpp
@@ -10,7 +10,7 @@
 #ifndef CATCH_OPTIMIZER_HPP_INCLUDED
 #define CATCH_OPTIMIZER_HPP_INCLUDED
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 #   include <atomic> // atomic_thread_fence
 #endif
 
@@ -32,16 +32,23 @@ namespace Catch {
         namespace Detail {
             inline void optimizer_barrier() { keep_memory(); }
         } // namespace Detail
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) || defined(__IAR_SYSTEMS_ICC__)
 
+#if defined(_MSVC_VER)
 #pragma optimize("", off)
+#elif defined(__IAR_SYSTEMS_ICC__)
+// For IAR the pragma only affects the following function
+#pragma optimize=disable
+#endif
         template <typename T>
         inline void keep_memory(T* p) {
             // thanks @milleniumbug
             *reinterpret_cast<char volatile*>(p) = *reinterpret_cast<char const volatile*>(p);
         }
         // TODO equivalent keep_memory()
+#if defined(_MSVC_VER)
 #pragma optimize("", on)
+#endif
 
         namespace Detail {
             inline void optimizer_barrier() {
@@ -63,7 +70,7 @@ namespace Catch {
 
         template <typename Fn, typename... Args>
         inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> std::enable_if_t<std::is_same<void, decltype(fn(args...))>::value> {
-            CATCH_FORWARD(fn) (CATCH_FORWARD(args)...);
+            CATCH_FORWARD((fn)) (CATCH_FORWARD(args)...);
         }
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_sample_analysis.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_sample_analysis.hpp
index d849d246..aeb87d05 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_sample_analysis.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/catch_sample_analysis.hpp
@@ -10,38 +10,20 @@
 #ifndef CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 #define CATCH_SAMPLE_ANALYSIS_HPP_INCLUDED
 
-#include <catch2/benchmark/catch_clock.hpp>
 #include <catch2/benchmark/catch_estimate.hpp>
 #include <catch2/benchmark/catch_outlier_classification.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
+#include <catch2/benchmark/catch_clock.hpp>
 
-#include <algorithm>
 #include <vector>
-#include <iterator>
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration>
         struct SampleAnalysis {
-            std::vector<Duration> samples;
-            Estimate<Duration> mean;
-            Estimate<Duration> standard_deviation;
+            std::vector<FDuration> samples;
+            Estimate<FDuration> mean;
+            Estimate<FDuration> standard_deviation;
             OutlierClassification outliers;
             double outlier_variance;
-
-            template <typename Duration2>
-            operator SampleAnalysis<Duration2>() const {
-                std::vector<Duration2> samples2;
-                samples2.reserve(samples.size());
-                std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](Duration d) { return Duration2(d); });
-                return {
-                    CATCH_MOVE(samples2),
-                    mean,
-                    standard_deviation,
-                    outliers,
-                    outlier_variance,
-                };
-            }
         };
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.cpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.cpp
new file mode 100644
index 00000000..7d27daf1
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.cpp
@@ -0,0 +1,85 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+// Adapted from donated nonius code.
+
+#include <catch2/benchmark/detail/catch_analyse.hpp>
+#include <catch2/benchmark/catch_clock.hpp>
+#include <catch2/benchmark/catch_sample_analysis.hpp>
+#include <catch2/benchmark/detail/catch_stats.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/internal/catch_move_and_forward.hpp>
+
+#include <vector>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last) {
+                if (!cfg.benchmarkNoAnalysis()) {
+                    std::vector<double> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+                    for (auto current = first; current != last; ++current) {
+                        samples.push_back( current->count() );
+                    }
+
+                    auto analysis = Catch::Benchmark::Detail::analyse_samples(
+                        cfg.benchmarkConfidenceInterval(),
+                        cfg.benchmarkResamples(),
+                        samples.data(),
+                        samples.data() + samples.size() );
+                    auto outliers = Catch::Benchmark::Detail::classify_outliers(
+                        samples.data(), samples.data() + samples.size() );
+
+                    auto wrap_estimate = [](Estimate<double> e) {
+                        return Estimate<FDuration> {
+                            FDuration(e.point),
+                                FDuration(e.lower_bound),
+                                FDuration(e.upper_bound),
+                                e.confidence_interval,
+                        };
+                    };
+                    std::vector<FDuration> samples2;
+                    samples2.reserve(samples.size());
+                    for (auto s : samples) {
+                        samples2.push_back( FDuration( s ) );
+                    }
+
+                    return {
+                        CATCH_MOVE(samples2),
+                        wrap_estimate(analysis.mean),
+                        wrap_estimate(analysis.standard_deviation),
+                        outliers,
+                        analysis.outlier_variance,
+                    };
+                } else {
+                    std::vector<FDuration> samples;
+                    samples.reserve(static_cast<size_t>(last - first));
+
+                    FDuration mean = FDuration(0);
+                    int i = 0;
+                    for (auto it = first; it < last; ++it, ++i) {
+                        samples.push_back(FDuration(*it));
+                        mean += FDuration(*it);
+                    }
+                    mean /= i;
+
+                    return SampleAnalysis{
+                        CATCH_MOVE(samples),
+                        Estimate<FDuration>{ mean, mean, mean, 0.0 },
+                        Estimate<FDuration>{ FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             FDuration( 0 ),
+                                             0.0 },
+                        OutlierClassification{},
+                        0.0
+                    };
+                }
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.hpp
index 77b0a9d3..5e3f7b0f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_analyse.hpp
@@ -11,68 +11,15 @@
 #define CATCH_ANALYSE_HPP_INCLUDED
 
 #include <catch2/benchmark/catch_clock.hpp>
-#include <catch2/benchmark/catch_environment.hpp>
 #include <catch2/benchmark/catch_sample_analysis.hpp>
-#include <catch2/benchmark/detail/catch_stats.hpp>
-#include <catch2/interfaces/catch_interfaces_config.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
 
-#include <algorithm>
-#include <iterator>
-#include <vector>
 
 namespace Catch {
+    class IConfig;
+
     namespace Benchmark {
         namespace Detail {
-            template <typename Duration, typename Iterator>
-            SampleAnalysis<Duration> analyse(const IConfig &cfg, Environment<Duration>, Iterator first, Iterator last) {
-                if (!cfg.benchmarkNoAnalysis()) {
-                    std::vector<double> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-                    std::transform(first, last, std::back_inserter(samples), [](Duration d) { return d.count(); });
-
-                    auto analysis = Catch::Benchmark::Detail::analyse_samples(cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(), samples.begin(), samples.end());
-                    auto outliers = Catch::Benchmark::Detail::classify_outliers(samples.begin(), samples.end());
-
-                    auto wrap_estimate = [](Estimate<double> e) {
-                        return Estimate<Duration> {
-                            Duration(e.point),
-                                Duration(e.lower_bound),
-                                Duration(e.upper_bound),
-                                e.confidence_interval,
-                        };
-                    };
-                    std::vector<Duration> samples2;
-                    samples2.reserve(samples.size());
-                    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2), [](double d) { return Duration(d); });
-                    return {
-                        CATCH_MOVE(samples2),
-                        wrap_estimate(analysis.mean),
-                        wrap_estimate(analysis.standard_deviation),
-                        outliers,
-                        analysis.outlier_variance,
-                    };
-                } else {
-                    std::vector<Duration> samples;
-                    samples.reserve(static_cast<size_t>(last - first));
-
-                    Duration mean = Duration(0);
-                    int i = 0;
-                    for (auto it = first; it < last; ++it, ++i) {
-                        samples.push_back(Duration(*it));
-                        mean += Duration(*it);
-                    }
-                    mean /= i;
-
-                    return {
-                        CATCH_MOVE(samples),
-                        Estimate<Duration>{mean, mean, mean, 0.0},
-                        Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},
-                        OutlierClassification{},
-                        0.0
-                    };
-                }
-            }
+            SampleAnalysis analyse(const IConfig &cfg, FDuration* first, FDuration* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp
index 15298258..144e4b6e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp
@@ -11,7 +11,6 @@
 #define CATCH_BENCHMARK_FUNCTION_HPP_INCLUDED
 
 #include <catch2/benchmark/catch_chronometer.hpp>
-#include <catch2/benchmark/detail/catch_complete_invoke.hpp>
 #include <catch2/internal/catch_meta.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp
new file mode 100644
index 00000000..3633bc9f
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp
@@ -0,0 +1,48 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_BENCHMARK_STATS_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_HPP_INCLUDED
+
+#include <catch2/benchmark/catch_estimate.hpp>
+#include <catch2/benchmark/catch_outlier_classification.hpp>
+// The fwd decl & default specialization needs to be seen by VS2017 before
+// BenchmarkStats itself, or VS2017 will report compilation error.
+#include <catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp>
+
+#include <string>
+#include <vector>
+
+namespace Catch {
+
+    struct BenchmarkInfo {
+        std::string name;
+        double estimatedDuration;
+        int iterations;
+        unsigned int samples;
+        unsigned int resamples;
+        double clockResolution;
+        double clockCost;
+    };
+
+    // We need to keep template parameter for backwards compatibility,
+    // but we also do not want to use the template paraneter.
+    template <class Dummy>
+    struct BenchmarkStats {
+        BenchmarkInfo info;
+
+        std::vector<Benchmark::FDuration> samples;
+        Benchmark::Estimate<Benchmark::FDuration> mean;
+        Benchmark::Estimate<Benchmark::FDuration> standardDeviation;
+        Benchmark::OutlierClassification outliers;
+        double outlierVariance;
+    };
+
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp
new file mode 100644
index 00000000..2ccc25d5
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp
@@ -0,0 +1,23 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+#define CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
+
+#include <catch2/benchmark/catch_clock.hpp>
+
+namespace Catch {
+
+    // We cannot forward declare the type with default template argument
+    // multiple times, so it is split out into a separate header so that
+    // we can prevent multiple declarations in dependees
+    template <typename Duration = Benchmark::FDuration>
+    struct BenchmarkStats;
+
+} // end namespace Catch
+
+#endif // CATCH_BENCHMARK_STATS_FWD_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp
index 49db413e..4dff4b7e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp
@@ -10,14 +10,9 @@
 #ifndef CATCH_COMPLETE_INVOKE_HPP_INCLUDED
 #define CATCH_COMPLETE_INVOKE_HPP_INCLUDED
 
-#include <catch2/internal/catch_test_failure_exception.hpp>
 #include <catch2/internal/catch_meta.hpp>
-#include <catch2/interfaces/catch_interfaces_capture.hpp>
-#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
-#include <type_traits>
-
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
index 907773f2..8e355279 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp
@@ -19,7 +19,6 @@
 #include <catch2/internal/catch_unique_ptr.hpp>
 
 #include <algorithm>
-#include <iterator>
 #include <vector>
 #include <cmath>
 
@@ -30,46 +29,49 @@ namespace Catch {
             std::vector<double> resolution(int k) {
                 std::vector<TimePoint<Clock>> times;
                 times.reserve(static_cast<size_t>(k + 1));
-                std::generate_n(std::back_inserter(times), k + 1, now<Clock>{});
+                for ( int i = 0; i < k + 1; ++i ) {
+                    times.push_back( Clock::now() );
+                }
 
                 std::vector<double> deltas;
                 deltas.reserve(static_cast<size_t>(k));
-                std::transform(std::next(times.begin()), times.end(), times.begin(),
-                    std::back_inserter(deltas),
-                    [](TimePoint<Clock> a, TimePoint<Clock> b) { return static_cast<double>((a - b).count()); });
+                for ( size_t idx = 1; idx < times.size(); ++idx ) {
+                    deltas.push_back( static_cast<double>(
+                        ( times[idx] - times[idx - 1] ).count() ) );
+                }
 
                 return deltas;
             }
 
-            const auto warmup_iterations = 10000;
-            const auto warmup_time = std::chrono::milliseconds(100);
-            const auto minimum_ticks = 1000;
-            const auto warmup_seed = 10000;
-            const auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
-            const auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
-            const auto clock_cost_estimation_tick_limit = 100000;
-            const auto clock_cost_estimation_time = std::chrono::milliseconds(10);
-            const auto clock_cost_estimation_iterations = 10000;
+            constexpr auto warmup_iterations = 10000;
+            constexpr auto warmup_time = std::chrono::milliseconds(100);
+            constexpr auto minimum_ticks = 1000;
+            constexpr auto warmup_seed = 10000;
+            constexpr auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
+            constexpr auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
+            constexpr auto clock_cost_estimation_tick_limit = 100000;
+            constexpr auto clock_cost_estimation_time = std::chrono::milliseconds(10);
+            constexpr auto clock_cost_estimation_iterations = 10000;
 
             template <typename Clock>
             int warmup() {
-                return run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_seed, &resolution<Clock>)
+                return run_for_at_least<Clock>(warmup_time, warmup_seed, &resolution<Clock>)
                     .iterations;
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_resolution(int iterations) {
-                auto r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_resolution_estimation_time), iterations, &resolution<Clock>)
+            EnvironmentEstimate estimate_clock_resolution(int iterations) {
+                auto r = run_for_at_least<Clock>(clock_resolution_estimation_time, iterations, &resolution<Clock>)
                     .result;
                 return {
-                    FloatDuration<Clock>(mean(r.begin(), r.end())),
-                    classify_outliers(r.begin(), r.end()),
+                    FDuration(mean(r.data(), r.data() + r.size())),
+                    classify_outliers(r.data(), r.data() + r.size()),
                 };
             }
             template <typename Clock>
-            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_cost(FloatDuration<Clock> resolution) {
+            EnvironmentEstimate estimate_clock_cost(FDuration resolution) {
                 auto time_limit = (std::min)(
                     resolution * clock_cost_estimation_tick_limit,
-                    FloatDuration<Clock>(clock_cost_estimation_time_limit));
+                    FDuration(clock_cost_estimation_time_limit));
                 auto time_clock = [](int k) {
                     return Detail::measure<Clock>([k] {
                         for (int i = 0; i < k; ++i) {
@@ -80,26 +82,28 @@ namespace Catch {
                 };
                 time_clock(1);
                 int iters = clock_cost_estimation_iterations;
-                auto&& r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_cost_estimation_time), iters, time_clock);
+                auto&& r = run_for_at_least<Clock>(clock_cost_estimation_time, iters, time_clock);
                 std::vector<double> times;
                 int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));
                 times.reserve(static_cast<size_t>(nsamples));
-                std::generate_n(std::back_inserter(times), nsamples, [time_clock, &r] {
-                    return static_cast<double>((time_clock(r.iterations) / r.iterations).count());
-                });
+                for ( int s = 0; s < nsamples; ++s ) {
+                    times.push_back( static_cast<double>(
+                        ( time_clock( r.iterations ) / r.iterations )
+                            .count() ) );
+                }
                 return {
-                    FloatDuration<Clock>(mean(times.begin(), times.end())),
-                    classify_outliers(times.begin(), times.end()),
+                    FDuration(mean(times.data(), times.data() + times.size())),
+                    classify_outliers(times.data(), times.data() + times.size()),
                 };
             }
 
             template <typename Clock>
-            Environment<FloatDuration<Clock>> measure_environment() {
+            Environment measure_environment() {
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wexit-time-destructors"
 #endif
-                static Catch::Detail::unique_ptr<Environment<FloatDuration<Clock>>> env;
+                static Catch::Detail::unique_ptr<Environment> env;
 #if defined(__clang__)
 #    pragma clang diagnostic pop
 #endif
@@ -111,7 +115,7 @@ namespace Catch {
                 auto resolution = Detail::estimate_clock_resolution<Clock>(iters);
                 auto cost = Detail::estimate_clock_cost<Clock>(resolution.mean);
 
-                env = Catch::Detail::make_unique<Environment<FloatDuration<Clock>>>( Environment<FloatDuration<Clock>>{resolution, cost} );
+                env = Catch::Detail::make_unique<Environment>( Environment{resolution, cost} );
                 return *env;
             }
         } // namespace Detail
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_measure.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_measure.hpp
index 388814c1..37494a68 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_measure.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_measure.hpp
@@ -10,7 +10,6 @@
 #ifndef CATCH_MEASURE_HPP_INCLUDED
 #define CATCH_MEASURE_HPP_INCLUDED
 
-#include <catch2/benchmark/catch_clock.hpp>
 #include <catch2/benchmark/detail/catch_complete_invoke.hpp>
 #include <catch2/benchmark/detail/catch_timing.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
@@ -19,7 +18,7 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun, typename... Args>
-            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
+            TimingOf<Fun, Args...> measure(Fun&& fun, Args&&... args) {
                 auto start = Clock::now();
                 auto&& r = Detail::complete_invoke(fun, CATCH_FORWARD(args)...);
                 auto end = Clock::now();
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp
index 35778b27..3ebdcc05 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp
@@ -7,9 +7,10 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/benchmark/detail/catch_run_for_at_least.hpp>
-#include <exception>
 #include <catch2/internal/catch_enforce.hpp>
 
+#include <exception>
+
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
index 976a4b24..4dfa8bbb 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp
@@ -24,11 +24,11 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
+            TimingOf<Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
                 return Detail::measure<Clock>(fun, iters);
             }
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
+            TimingOf<Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
                 Detail::ChronometerModel<Clock> meter;
                 auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));
 
@@ -43,8 +43,8 @@ namespace Catch {
             void throw_optimized_away_error();
 
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>>
-                run_for_at_least(ClockDuration<Clock> how_long,
+            TimingOf<Fun, run_for_at_least_argument_t<Clock, Fun>>
+                run_for_at_least(IDuration how_long,
                                  const int initial_iterations,
                                  Fun&& fun) {
                 auto iters = initial_iterations;
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.cpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.cpp
index 514ed1f7..52cee4ee 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.cpp
@@ -10,10 +10,14 @@
 #include <catch2/benchmark/detail/catch_stats.hpp>
 
 #include <catch2/internal/catch_compiler_capabilities.hpp>
+#include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_number_generator.hpp>
 
+#include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <cstddef>
-#include <iterator>
+#include <numeric>
 #include <random>
 
 
@@ -21,139 +25,199 @@
 #include <future>
 #endif
 
-namespace {
-
-using Catch::Benchmark::Detail::sample;
-
-     template <typename URng, typename Estimator>
-     sample resample(URng& rng, unsigned int resamples, std::vector<double>::iterator first, std::vector<double>::iterator last, Estimator& estimator) {
-         auto n = static_cast<size_t>(last - first);
-         std::uniform_int_distribution<decltype(n)> dist(0, n - 1);
-
-         sample out;
-         out.reserve(resamples);
-         std::generate_n(std::back_inserter(out), resamples, [n, first, &estimator, &dist, &rng] {
-             std::vector<double> resampled;
-             resampled.reserve(n);
-             std::generate_n(std::back_inserter(resampled), n, [first, &dist, &rng] { return first[static_cast<std::ptrdiff_t>(dist(rng))]; });
-             return estimator(resampled.begin(), resampled.end());
-         });
-         std::sort(out.begin(), out.end());
-         return out;
-     }
-
-
-    double erf_inv(double x) {
-        // Code accompanying the article "Approximating the erfinv function" in GPU Computing Gems, Volume 2
-        double w, p;
-
-        w = -log((1.0 - x) * (1.0 + x));
-
-        if (w < 6.250000) {
-            w = w - 3.125000;
-            p = -3.6444120640178196996e-21;
-            p = -1.685059138182016589e-19 + p * w;
-            p = 1.2858480715256400167e-18 + p * w;
-            p = 1.115787767802518096e-17 + p * w;
-            p = -1.333171662854620906e-16 + p * w;
-            p = 2.0972767875968561637e-17 + p * w;
-            p = 6.6376381343583238325e-15 + p * w;
-            p = -4.0545662729752068639e-14 + p * w;
-            p = -8.1519341976054721522e-14 + p * w;
-            p = 2.6335093153082322977e-12 + p * w;
-            p = -1.2975133253453532498e-11 + p * w;
-            p = -5.4154120542946279317e-11 + p * w;
-            p = 1.051212273321532285e-09 + p * w;
-            p = -4.1126339803469836976e-09 + p * w;
-            p = -2.9070369957882005086e-08 + p * w;
-            p = 4.2347877827932403518e-07 + p * w;
-            p = -1.3654692000834678645e-06 + p * w;
-            p = -1.3882523362786468719e-05 + p * w;
-            p = 0.0001867342080340571352 + p * w;
-            p = -0.00074070253416626697512 + p * w;
-            p = -0.0060336708714301490533 + p * w;
-            p = 0.24015818242558961693 + p * w;
-            p = 1.6536545626831027356 + p * w;
-        } else if (w < 16.000000) {
-            w = sqrt(w) - 3.250000;
-            p = 2.2137376921775787049e-09;
-            p = 9.0756561938885390979e-08 + p * w;
-            p = -2.7517406297064545428e-07 + p * w;
-            p = 1.8239629214389227755e-08 + p * w;
-            p = 1.5027403968909827627e-06 + p * w;
-            p = -4.013867526981545969e-06 + p * w;
-            p = 2.9234449089955446044e-06 + p * w;
-            p = 1.2475304481671778723e-05 + p * w;
-            p = -4.7318229009055733981e-05 + p * w;
-            p = 6.8284851459573175448e-05 + p * w;
-            p = 2.4031110387097893999e-05 + p * w;
-            p = -0.0003550375203628474796 + p * w;
-            p = 0.00095328937973738049703 + p * w;
-            p = -0.0016882755560235047313 + p * w;
-            p = 0.0024914420961078508066 + p * w;
-            p = -0.0037512085075692412107 + p * w;
-            p = 0.005370914553590063617 + p * w;
-            p = 1.0052589676941592334 + p * w;
-            p = 3.0838856104922207635 + p * w;
-        } else {
-            w = sqrt(w) - 5.000000;
-            p = -2.7109920616438573243e-11;
-            p = -2.5556418169965252055e-10 + p * w;
-            p = 1.5076572693500548083e-09 + p * w;
-            p = -3.7894654401267369937e-09 + p * w;
-            p = 7.6157012080783393804e-09 + p * w;
-            p = -1.4960026627149240478e-08 + p * w;
-            p = 2.9147953450901080826e-08 + p * w;
-            p = -6.7711997758452339498e-08 + p * w;
-            p = 2.2900482228026654717e-07 + p * w;
-            p = -9.9298272942317002539e-07 + p * w;
-            p = 4.5260625972231537039e-06 + p * w;
-            p = -1.9681778105531670567e-05 + p * w;
-            p = 7.5995277030017761139e-05 + p * w;
-            p = -0.00021503011930044477347 + p * w;
-            p = -0.00013871931833623122026 + p * w;
-            p = 1.0103004648645343977 + p * w;
-            p = 4.8499064014085844221 + p * w;
-        }
-        return p * x;
-    }
-
-    double standard_deviation(std::vector<double>::iterator first, std::vector<double>::iterator last) {
-        auto m = Catch::Benchmark::Detail::mean(first, last);
-        double variance = std::accumulate( first,
-                                           last,
-                                           0.,
-                                           [m]( double a, double b ) {
-                                               double diff = b - m;
-                                               return a + diff * diff;
-                                           } ) /
-                          ( last - first );
-        return std::sqrt( variance );
-    }
-
-}
-
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
+            namespace {
+
+                template <typename URng, typename Estimator>
+                static sample
+                resample( URng& rng,
+                          unsigned int resamples,
+                          double const* first,
+                          double const* last,
+                          Estimator& estimator ) {
+                    auto n = static_cast<size_t>( last - first );
+                    std::uniform_int_distribution<size_t> dist( 0, n - 1 );
+
+                    sample out;
+                    out.reserve( resamples );
+                    std::vector<double> resampled;
+                    resampled.reserve( n );
+                    for ( size_t i = 0; i < resamples; ++i ) {
+                        resampled.clear();
+                        for ( size_t s = 0; s < n; ++s ) {
+                            resampled.push_back( first[dist( rng )] );
+                        }
+                        const auto estimate =
+                            estimator( resampled.data(), resampled.data() + resampled.size() );
+                        out.push_back( estimate );
+                    }
+                    std::sort( out.begin(), out.end() );
+                    return out;
+                }
 
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-            bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
-#if defined( __GNUC__ ) || defined( __clang__ )
-#    pragma GCC diagnostic pop
-#endif
+                static double outlier_variance( Estimate<double> mean,
+                                                Estimate<double> stddev,
+                                                int n ) {
+                    double sb = stddev.point;
+                    double mn = mean.point / n;
+                    double mg_min = mn / 2.;
+                    double sg = (std::min)( mg_min / 4., sb / std::sqrt( n ) );
+                    double sg2 = sg * sg;
+                    double sb2 = sb * sb;
+
+                    auto c_max = [n, mn, sb2, sg2]( double x ) -> double {
+                        double k = mn - x;
+                        double d = k * k;
+                        double nd = n * d;
+                        double k0 = -n * nd;
+                        double k1 = sb2 - n * sg2 + nd;
+                        double det = k1 * k1 - 4 * sg2 * k0;
+                        return static_cast<int>( -2. * k0 /
+                                                 ( k1 + std::sqrt( det ) ) );
+                    };
+
+                    auto var_out = [n, sb2, sg2]( double c ) {
+                        double nc = n - c;
+                        return ( nc / n ) * ( sb2 - nc * sg2 );
+                    };
+
+                    return (std::min)( var_out( 1 ),
+                                       var_out(
+                                           (std::min)( c_max( 0. ),
+                                                       c_max( mg_min ) ) ) ) /
+                           sb2;
+                }
+
+                static double erf_inv( double x ) {
+                    // Code accompanying the article "Approximating the erfinv
+                    // function" in GPU Computing Gems, Volume 2
+                    double w, p;
+
+                    w = -log( ( 1.0 - x ) * ( 1.0 + x ) );
+
+                    if ( w < 6.250000 ) {
+                        w = w - 3.125000;
+                        p = -3.6444120640178196996e-21;
+                        p = -1.685059138182016589e-19 + p * w;
+                        p = 1.2858480715256400167e-18 + p * w;
+                        p = 1.115787767802518096e-17 + p * w;
+                        p = -1.333171662854620906e-16 + p * w;
+                        p = 2.0972767875968561637e-17 + p * w;
+                        p = 6.6376381343583238325e-15 + p * w;
+                        p = -4.0545662729752068639e-14 + p * w;
+                        p = -8.1519341976054721522e-14 + p * w;
+                        p = 2.6335093153082322977e-12 + p * w;
+                        p = -1.2975133253453532498e-11 + p * w;
+                        p = -5.4154120542946279317e-11 + p * w;
+                        p = 1.051212273321532285e-09 + p * w;
+                        p = -4.1126339803469836976e-09 + p * w;
+                        p = -2.9070369957882005086e-08 + p * w;
+                        p = 4.2347877827932403518e-07 + p * w;
+                        p = -1.3654692000834678645e-06 + p * w;
+                        p = -1.3882523362786468719e-05 + p * w;
+                        p = 0.0001867342080340571352 + p * w;
+                        p = -0.00074070253416626697512 + p * w;
+                        p = -0.0060336708714301490533 + p * w;
+                        p = 0.24015818242558961693 + p * w;
+                        p = 1.6536545626831027356 + p * w;
+                    } else if ( w < 16.000000 ) {
+                        w = sqrt( w ) - 3.250000;
+                        p = 2.2137376921775787049e-09;
+                        p = 9.0756561938885390979e-08 + p * w;
+                        p = -2.7517406297064545428e-07 + p * w;
+                        p = 1.8239629214389227755e-08 + p * w;
+                        p = 1.5027403968909827627e-06 + p * w;
+                        p = -4.013867526981545969e-06 + p * w;
+                        p = 2.9234449089955446044e-06 + p * w;
+                        p = 1.2475304481671778723e-05 + p * w;
+                        p = -4.7318229009055733981e-05 + p * w;
+                        p = 6.8284851459573175448e-05 + p * w;
+                        p = 2.4031110387097893999e-05 + p * w;
+                        p = -0.0003550375203628474796 + p * w;
+                        p = 0.00095328937973738049703 + p * w;
+                        p = -0.0016882755560235047313 + p * w;
+                        p = 0.0024914420961078508066 + p * w;
+                        p = -0.0037512085075692412107 + p * w;
+                        p = 0.005370914553590063617 + p * w;
+                        p = 1.0052589676941592334 + p * w;
+                        p = 3.0838856104922207635 + p * w;
+                    } else {
+                        w = sqrt( w ) - 5.000000;
+                        p = -2.7109920616438573243e-11;
+                        p = -2.5556418169965252055e-10 + p * w;
+                        p = 1.5076572693500548083e-09 + p * w;
+                        p = -3.7894654401267369937e-09 + p * w;
+                        p = 7.6157012080783393804e-09 + p * w;
+                        p = -1.4960026627149240478e-08 + p * w;
+                        p = 2.9147953450901080826e-08 + p * w;
+                        p = -6.7711997758452339498e-08 + p * w;
+                        p = 2.2900482228026654717e-07 + p * w;
+                        p = -9.9298272942317002539e-07 + p * w;
+                        p = 4.5260625972231537039e-06 + p * w;
+                        p = -1.9681778105531670567e-05 + p * w;
+                        p = 7.5995277030017761139e-05 + p * w;
+                        p = -0.00021503011930044477347 + p * w;
+                        p = -0.00013871931833623122026 + p * w;
+                        p = 1.0103004648645343977 + p * w;
+                        p = 4.8499064014085844221 + p * w;
+                    }
+                    return p * x;
+                }
+
+                static double
+                standard_deviation( double const* first, double const* last ) {
+                    auto m = Catch::Benchmark::Detail::mean( first, last );
+                    double variance =
+                        std::accumulate( first,
+                                         last,
+                                         0.,
+                                         [m]( double a, double b ) {
+                                             double diff = b - m;
+                                             return a + diff * diff;
+                                         } ) /
+                        ( last - first );
+                    return std::sqrt( variance );
+                }
+
+                static sample jackknife( double ( *estimator )( double const*,
+                                                                double const* ),
+                                         double* first,
+                                         double* last ) {
+                    const auto second = first + 1;
+                    sample results;
+                    results.reserve( static_cast<size_t>( last - first ) );
+
+                    for ( auto it = first; it != last; ++it ) {
+                        std::iter_swap( it, first );
+                        results.push_back( estimator( second, last ) );
+                    }
+
+                    return results;
+                }
+
+
+            } // namespace
+        }     // namespace Detail
+    }         // namespace Benchmark
+} // namespace Catch
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
 
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last ) {
                 auto count = last - first;
                 double idx = (count - 1) * k / static_cast<double>(q);
                 int j = static_cast<int>(idx);
                 double g = idx - j;
                 std::nth_element(first, first + j, last);
                 auto xj = first[j];
-                if ( directCompare( g, 0 ) ) {
+                if ( Catch::Detail::directCompare( g, 0 ) ) {
                     return xj;
                 }
 
@@ -161,6 +225,48 @@ namespace Catch {
                 return xj + g * (xj1 - xj);
             }
 
+            OutlierClassification
+            classify_outliers( double const* first, double const* last ) {
+                std::vector<double> copy( first, last );
+
+                auto q1 = weighted_average_quantile( 1, 4, copy.data(), copy.data() + copy.size() );
+                auto q3 = weighted_average_quantile( 3, 4, copy.data(), copy.data() + copy.size() );
+                auto iqr = q3 - q1;
+                auto los = q1 - ( iqr * 3. );
+                auto lom = q1 - ( iqr * 1.5 );
+                auto him = q3 + ( iqr * 1.5 );
+                auto his = q3 + ( iqr * 3. );
+
+                OutlierClassification o;
+                for ( ; first != last; ++first ) {
+                    const double t = *first;
+                    if ( t < los ) {
+                        ++o.low_severe;
+                    } else if ( t < lom ) {
+                        ++o.low_mild;
+                    } else if ( t > his ) {
+                        ++o.high_severe;
+                    } else if ( t > him ) {
+                        ++o.high_mild;
+                    }
+                    ++o.samples_seen;
+                }
+                return o;
+            }
+
+            double mean( double const* first, double const* last ) {
+                auto count = last - first;
+                double sum = 0.;
+                while (first != last) {
+                    sum += *first;
+                    ++first;
+                }
+                return sum / static_cast<double>(count);
+            }
+
+            double normal_cdf( double x ) {
+                return std::erfc( -x / std::sqrt( 2.0 ) ) / 2.0;
+            }
 
             double erfc_inv(double x) {
                 return erf_inv(1.0 - x);
@@ -182,50 +288,77 @@ namespace Catch {
                 return result;
             }
 
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) ) {
+                auto n_samples = last - first;
+
+                double point = estimator( first, last );
+                // Degenerate case with a single sample
+                if ( n_samples == 1 )
+                    return { point, point, point, confidence_level };
+
+                sample jack = jackknife( estimator, first, last );
+                double jack_mean =
+                    mean( jack.data(), jack.data() + jack.size() );
+                double sum_squares = 0, sum_cubes = 0;
+                for ( double x : jack ) {
+                    auto difference = jack_mean - x;
+                    auto square = difference * difference;
+                    auto cube = square * difference;
+                    sum_squares += square;
+                    sum_cubes += cube;
+                }
 
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n) {
-                double sb = stddev.point;
-                double mn = mean.point / n;
-                double mg_min = mn / 2.;
-                double sg = (std::min)(mg_min / 4., sb / std::sqrt(n));
-                double sg2 = sg * sg;
-                double sb2 = sb * sb;
-
-                auto c_max = [n, mn, sb2, sg2](double x) -> double {
-                    double k = mn - x;
-                    double d = k * k;
-                    double nd = n * d;
-                    double k0 = -n * nd;
-                    double k1 = sb2 - n * sg2 + nd;
-                    double det = k1 * k1 - 4 * sg2 * k0;
-                    return static_cast<int>(-2. * k0 / (k1 + std::sqrt(det)));
-                };
+                double accel = sum_cubes / ( 6 * std::pow( sum_squares, 1.5 ) );
+                long n = static_cast<long>( resample.size() );
+                double prob_n =
+                    std::count_if( resample.begin(),
+                                   resample.end(),
+                                   [point]( double x ) { return x < point; } ) /
+                    static_cast<double>( n );
+                // degenerate case with uniform samples
+                if ( Catch::Detail::directCompare( prob_n, 0. ) ) {
+                    return { point, point, point, confidence_level };
+                }
 
-                auto var_out = [n, sb2, sg2](double c) {
-                    double nc = n - c;
-                    return (nc / n) * (sb2 - nc * sg2);
-                };
+                double bias = normal_quantile( prob_n );
+                double z1 = normal_quantile( ( 1. - confidence_level ) / 2. );
 
-                return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) / sb2;
+                auto cumn = [n]( double x ) -> long {
+                    return std::lround( normal_cdf( x ) *
+                                        static_cast<double>( n ) );
+                };
+                auto a = [bias, accel]( double b ) {
+                    return bias + b / ( 1. - accel * b );
+                };
+                double b1 = bias + z1;
+                double b2 = bias - z1;
+                double a1 = a( b1 );
+                double a2 = a( b2 );
+                auto lo = static_cast<size_t>( (std::max)( cumn( a1 ), 0l ) );
+                auto hi =
+                    static_cast<size_t>( (std::min)( cumn( a2 ), n - 1 ) );
+
+                return { point, resample[lo], resample[hi], confidence_level };
             }
 
-
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last) {
-                CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
-                CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
-                static std::random_device entropy;
-                CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
-
-                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
-
-                auto mean = &Detail::mean<std::vector<double>::iterator>;
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last) {
+                auto mean = &Detail::mean;
                 auto stddev = &standard_deviation;
 
 #if defined(CATCH_CONFIG_USE_ASYNC)
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
+                auto Estimate = [=](double(*f)(double const*, double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
                     return std::async(std::launch::async, [=] {
-                        std::mt19937 rng(seed);
+                        SimplePcg32 rng( seed );
                         auto resampled = resample(rng, n_resamples, first, last, f);
                         return bootstrap(confidence_level, first, last, resampled, f);
                     });
@@ -237,9 +370,10 @@ namespace Catch {
                 auto mean_estimate = mean_future.get();
                 auto stddev_estimate = stddev_future.get();
 #else
-                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
-                    auto seed = entropy();
-                    std::mt19937 rng(seed);
+                auto Estimate = [=](double(*f)(double const* , double const*)) {
+                    std::random_device rd;
+                    auto seed = rd();
+                    SimplePcg32 rng( seed );
                     auto resampled = resample(rng, n_resamples, first, last, f);
                     return bootstrap(confidence_level, first, last, resampled, f);
                 };
@@ -248,6 +382,7 @@ namespace Catch {
                 auto stddev_estimate = Estimate(stddev);
 #endif // CATCH_USE_ASYNC
 
+                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
                 double outlier_variance = Detail::outlier_variance(mean_estimate, stddev_estimate, n);
 
                 return { mean_estimate, stddev_estimate, outlier_variance };
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.hpp
index 4c54ec52..3bea612f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_stats.hpp
@@ -13,122 +13,35 @@
 #include <catch2/benchmark/catch_estimate.hpp>
 #include <catch2/benchmark/catch_outlier_classification.hpp>
 
-#include <algorithm>
 #include <vector>
-#include <numeric>
-#include <tuple>
-#include <cmath>
 
 namespace Catch {
     namespace Benchmark {
         namespace Detail {
             using sample = std::vector<double>;
 
-            // Used when we know we want == comparison of two doubles
-            // to centralize warning suppression
-            bool directCompare( double lhs, double rhs );
-
-            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last);
-
-            template <typename Iterator>
-            OutlierClassification classify_outliers(Iterator first, Iterator last) {
-                std::vector<double> copy(first, last);
-
-                auto q1 = weighted_average_quantile(1, 4, copy.begin(), copy.end());
-                auto q3 = weighted_average_quantile(3, 4, copy.begin(), copy.end());
-                auto iqr = q3 - q1;
-                auto los = q1 - (iqr * 3.);
-                auto lom = q1 - (iqr * 1.5);
-                auto him = q3 + (iqr * 1.5);
-                auto his = q3 + (iqr * 3.);
-
-                OutlierClassification o;
-                for (; first != last; ++first) {
-                    auto&& t = *first;
-                    if (t < los) ++o.low_severe;
-                    else if (t < lom) ++o.low_mild;
-                    else if (t > his) ++o.high_severe;
-                    else if (t > him) ++o.high_mild;
-                    ++o.samples_seen;
-                }
-                return o;
-            }
-
-            template <typename Iterator>
-            double mean(Iterator first, Iterator last) {
-                auto count = last - first;
-                double sum = std::accumulate(first, last, 0.);
-                return sum / static_cast<double>(count);
-            }
-
-            template <typename Estimator, typename Iterator>
-            sample jackknife(Estimator&& estimator, Iterator first, Iterator last) {
-                auto n = static_cast<size_t>(last - first);
-                auto second = first;
-                ++second;
-                sample results;
-                results.reserve(n);
-
-                for (auto it = first; it != last; ++it) {
-                    std::iter_swap(it, first);
-                    results.push_back(estimator(second, last));
-                }
-
-                return results;
-            }
-
-            inline double normal_cdf(double x) {
-                return std::erfc(-x / std::sqrt(2.0)) / 2.0;
-            }
+            double weighted_average_quantile( int k,
+                                              int q,
+                                              double* first,
+                                              double* last );
+
+            OutlierClassification
+            classify_outliers( double const* first, double const* last );
+
+            double mean( double const* first, double const* last );
+
+            double normal_cdf( double x );
 
             double erfc_inv(double x);
 
             double normal_quantile(double p);
 
-            template <typename Iterator, typename Estimator>
-            Estimate<double> bootstrap(double confidence_level, Iterator first, Iterator last, sample const& resample, Estimator&& estimator) {
-                auto n_samples = last - first;
-
-                double point = estimator(first, last);
-                // Degenerate case with a single sample
-                if (n_samples == 1) return { point, point, point, confidence_level };
-
-                sample jack = jackknife(estimator, first, last);
-                double jack_mean = mean(jack.begin(), jack.end());
-                double sum_squares, sum_cubes;
-                std::tie(sum_squares, sum_cubes) = std::accumulate(jack.begin(), jack.end(), std::make_pair(0., 0.), [jack_mean](std::pair<double, double> sqcb, double x) -> std::pair<double, double> {
-                    auto d = jack_mean - x;
-                    auto d2 = d * d;
-                    auto d3 = d2 * d;
-                    return { sqcb.first + d2, sqcb.second + d3 };
-                });
-
-                double accel = sum_cubes / (6 * std::pow(sum_squares, 1.5));
-                long n = static_cast<long>(resample.size());
-                double prob_n = std::count_if(resample.begin(), resample.end(), [point](double x) { return x < point; }) / static_cast<double>(n);
-                // degenerate case with uniform samples
-                if ( directCompare( prob_n, 0. ) ) {
-                    return { point, point, point, confidence_level };
-                }
-
-                double bias = normal_quantile(prob_n);
-                double z1 = normal_quantile((1. - confidence_level) / 2.);
-
-                auto cumn = [n]( double x ) -> long {
-                    return std::lround( normal_cdf( x ) * static_cast<double>(n) );
-                };
-                auto a = [bias, accel](double b) { return bias + b / (1. - accel * b); };
-                double b1 = bias + z1;
-                double b2 = bias - z1;
-                double a1 = a(b1);
-                double a2 = a(b2);
-                auto lo = static_cast<size_t>((std::max)(cumn(a1), 0l));
-                auto hi = static_cast<size_t>((std::min)(cumn(a2), n - 1));
-
-                return { point, resample[lo], resample[hi], confidence_level };
-            }
-
-            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n);
+            Estimate<double>
+            bootstrap( double confidence_level,
+                       double* first,
+                       double* last,
+                       sample const& resample,
+                       double ( *estimator )( double const*, double const* ) );
 
             struct bootstrap_analysis {
                 Estimate<double> mean;
@@ -136,7 +49,10 @@ namespace Catch {
                 double outlier_variance;
             };
 
-            bootstrap_analysis analyse_samples(double confidence_level, unsigned int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last);
+            bootstrap_analysis analyse_samples(double confidence_level,
+                                               unsigned int n_resamples,
+                                               double* first,
+                                               double* last);
         } // namespace Detail
     } // namespace Benchmark
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_timing.hpp b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_timing.hpp
index f5c25571..da567190 100644
--- a/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_timing.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/benchmark/detail/catch_timing.hpp
@@ -17,14 +17,14 @@
 
 namespace Catch {
     namespace Benchmark {
-        template <typename Duration, typename Result>
+        template <typename Result>
         struct Timing {
-            Duration elapsed;
+            IDuration elapsed;
             Result result;
             int iterations;
         };
-        template <typename Clock, typename Func, typename... Args>
-        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
+        template <typename Func, typename... Args>
+        using TimingOf = Timing<Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
     } // namespace Benchmark
 } // namespace Catch
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_all.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_all.hpp
index be146421..f2cc8536 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_all.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_all.hpp
@@ -54,6 +54,8 @@
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_config_android_logwrite.hpp>
 #include <catch2/internal/catch_config_counter.hpp>
+#include <catch2/internal/catch_config_prefix_messages.hpp>
+#include <catch2/internal/catch_config_static_analysis_support.hpp>
 #include <catch2/internal/catch_config_uncaught_exceptions.hpp>
 #include <catch2/internal/catch_config_wchar.hpp>
 #include <catch2/internal/catch_console_colour.hpp>
@@ -72,6 +74,7 @@
 #include <catch2/internal/catch_getenv.hpp>
 #include <catch2/internal/catch_is_permutation.hpp>
 #include <catch2/internal/catch_istream.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
 #include <catch2/internal/catch_lazy_expr.hpp>
 #include <catch2/internal/catch_leak_detector.hpp>
 #include <catch2/internal/catch_list.hpp>
@@ -86,7 +89,10 @@
 #include <catch2/internal/catch_platform.hpp>
 #include <catch2/internal/catch_polyfills.hpp>
 #include <catch2/internal/catch_preprocessor.hpp>
+#include <catch2/internal/catch_preprocessor_internal_stringify.hpp>
 #include <catch2/internal/catch_preprocessor_remove_parens.hpp>
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
 #include <catch2/internal/catch_random_seed_generation.hpp>
 #include <catch2/internal/catch_reporter_registry.hpp>
@@ -111,10 +117,13 @@
 #include <catch2/internal/catch_test_failure_exception.hpp>
 #include <catch2/internal/catch_test_macro_impl.hpp>
 #include <catch2/internal/catch_test_registry.hpp>
+#include <catch2/internal/catch_test_run_info.hpp>
 #include <catch2/internal/catch_test_spec_parser.hpp>
 #include <catch2/internal/catch_textflow.hpp>
 #include <catch2/internal/catch_to_string.hpp>
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
 #include <catch2/internal/catch_unique_name.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 #include <catch2/internal/catch_void_type.hpp>
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_approx.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_approx.cpp
index 407586d1..9ad4ce3e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_approx.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_approx.cpp
@@ -70,10 +70,10 @@ namespace Catch {
     }
 
 namespace literals {
-    Approx operator "" _a(long double val) {
+    Approx operator ""_a(long double val) {
         return Approx(val);
     }
-    Approx operator "" _a(unsigned long long val) {
+    Approx operator ""_a(unsigned long long val) {
         return Approx(val);
     }
 } // end namespace literals
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_config.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_config.cpp
index eb4f5ad3..34f50f17 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_config.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_config.cpp
@@ -105,7 +105,7 @@ namespace Catch {
             elem = trim(elem);
         }
 
-        // Insert the default reporter if user hasn't asked for a specfic one
+        // Insert the default reporter if user hasn't asked for a specific one
         if ( m_data.reporterSpecifications.empty() ) {
             m_data.reporterSpecifications.push_back( {
 #if defined( CATCH_CONFIG_DEFAULT_REPORTER )
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_config.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_config.hpp
index 784de4aa..17e983e5 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_config.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_config.hpp
@@ -69,7 +69,7 @@ namespace Catch {
         bool benchmarkNoAnalysis = false;
         unsigned int benchmarkSamples = 100;
         double benchmarkConfidenceInterval = 0.95;
-        unsigned int benchmarkResamples = 100000;
+        unsigned int benchmarkResamples = 100'000;
         std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
 
         Verbosity verbosity = Verbosity::Normal;
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_message.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_message.cpp
index d4723e94..384f180e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_message.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_message.cpp
@@ -37,7 +37,11 @@ namespace Catch {
     }
 
 
-    Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) {
+    Capturer::Capturer( StringRef macroName,
+                        SourceLineInfo const& lineInfo,
+                        ResultWas::OfType resultType,
+                        StringRef names ):
+        m_resultCapture( getResultCapture() ) {
         auto trimmed = [&] (size_t start, size_t end) {
             while (names[start] == ',' || isspace(static_cast<unsigned char>(names[start]))) {
                 ++start;
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_message.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_message.hpp
index e6bc1b5d..05325ee8 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_message.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_message.hpp
@@ -8,12 +8,13 @@
 #ifndef CATCH_MESSAGE_HPP_INCLUDED
 #define CATCH_MESSAGE_HPP_INCLUDED
 
+#include <catch2/internal/catch_config_prefix_messages.hpp>
 #include <catch2/internal/catch_result_type.hpp>
 #include <catch2/internal/catch_reusable_string_stream.hpp>
 #include <catch2/internal/catch_stream_end_stop.hpp>
 #include <catch2/internal/catch_message_info.hpp>
-#include <catch2/interfaces/catch_interfaces_capture.hpp>
 #include <catch2/catch_tostring.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
 
 #include <string>
 #include <vector>
@@ -21,6 +22,7 @@
 namespace Catch {
 
     struct SourceLineInfo;
+    class IResultCapture;
 
     struct MessageStream {
 
@@ -61,7 +63,7 @@ namespace Catch {
 
     class Capturer {
         std::vector<MessageInfo> m_messages;
-        IResultCapture& m_resultCapture = getResultCapture();
+        IResultCapture& m_resultCapture;
         size_t m_captured = 0;
     public:
         Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names );
@@ -112,28 +114,28 @@ namespace Catch {
     Catch::getResultCapture().emplaceUnscopedMessage( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log )
 
 
-#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#if defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( "CATCH_INFO", msg )
   #define CATCH_UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "CATCH_UNSCOPED_INFO", msg )
   #define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( "CATCH_WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CATCH_CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CATCH_CAPTURE", __VA_ARGS__ )
 
-#elif defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define CATCH_INFO( msg )          (void)(0)
   #define CATCH_UNSCOPED_INFO( msg ) (void)(0)
   #define CATCH_WARN( msg )          (void)(0)
   #define CATCH_CAPTURE( ... )       (void)(0)
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && !defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg ) INTERNAL_CATCH_INFO( "INFO", msg )
   #define UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "UNSCOPED_INFO", msg )
   #define WARN( msg ) INTERNAL_CATCH_MSG( "WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
   #define CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CAPTURE", __VA_ARGS__ )
 
-#elif !defined(CATCH_CONFIG_PREFIX_ALL) && defined(CATCH_CONFIG_DISABLE)
+#elif !defined(CATCH_CONFIG_PREFIX_MESSAGES) && defined(CATCH_CONFIG_DISABLE)
 
   #define INFO( msg )          (void)(0)
   #define UNSCOPED_INFO( msg ) (void)(0)
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_registry_hub.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_registry_hub.cpp
index 243dd2b0..8716db3a 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_registry_hub.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_registry_hub.cpp
@@ -20,6 +20,9 @@
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
+
+#include <exception>
 
 namespace Catch {
 
@@ -31,7 +34,7 @@ namespace Catch {
 
         public: // IRegistryHub
             RegistryHub() = default;
-            IReporterRegistry const& getReporterRegistry() const override {
+            ReporterRegistry const& getReporterRegistry() const override {
                 return m_reporterRegistry;
             }
             ITestCaseRegistry const& getTestCaseRegistry() const override {
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_session.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_session.cpp
index 43465f0c..f1ed5f9c 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_session.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_session.cpp
@@ -13,13 +13,13 @@
 #include <catch2/internal/catch_run_context.hpp>
 #include <catch2/catch_test_spec.hpp>
 #include <catch2/catch_version.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/internal/catch_startup_exception_registry.hpp>
 #include <catch2/internal/catch_sharding.hpp>
+#include <catch2/internal/catch_test_case_registry_impl.hpp>
 #include <catch2/internal/catch_textflow.hpp>
 #include <catch2/internal/catch_windows_h_proxy.hpp>
 #include <catch2/reporters/catch_reporter_multi.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_stdstreams.hpp>
@@ -27,6 +27,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <exception>
 #include <iomanip>
 #include <set>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.cpp
index a6adce0a..c38ee55a 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.cpp
@@ -9,6 +9,7 @@
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_case_insensitive_comparisons.hpp>
+#include <catch2/internal/catch_test_registry.hpp>
 
 #include <cassert>
 #include <cctype>
@@ -139,12 +140,20 @@ namespace Catch {
         for (size_t idx = 0; idx < originalTags.size(); ++idx) {
             auto c = originalTags[idx];
             if (c == '[') {
-                assert(!inTag);
+                CATCH_ENFORCE(
+                    !inTag,
+                    "Found '[' inside a tag while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = true;
                 tagStart = idx;
             }
             if (c == ']') {
-                assert(inTag);
+                CATCH_ENFORCE(
+                    inTag,
+                    "Found unmatched ']' while registering test case '"
+                        << _nameAndTags.name << "' at " << _lineInfo );
+
                 inTag = false;
                 tagEnd = idx;
                 assert(tagStart < tagEnd);
@@ -153,7 +162,11 @@ namespace Catch {
                 // it over to backing storage and actually reference the
                 // backing storage in the saved tags
                 StringRef tagStr = originalTags.substr(tagStart+1, tagEnd - tagStart - 1);
-                CATCH_ENFORCE(!tagStr.empty(), "Empty tags are not allowed");
+                CATCH_ENFORCE( !tagStr.empty(),
+                               "Found an empty tag while registering test case '"
+                                   << _nameAndTags.name << "' at "
+                                   << _lineInfo );
+
                 enforceNotReservedTag(tagStr, lineInfo);
                 properties |= parseSpecialTag(tagStr);
                 // When copying a tag to the backing storage, we need to
@@ -167,8 +180,12 @@ namespace Catch {
                 // the tags.
                 internalAppendTag(tagStr);
             }
-            (void)inTag; // Silence "set-but-unused" warning in release mode.
         }
+        CATCH_ENFORCE( !inTag,
+                       "Found an unclosed tag while registering test case '"
+                           << _nameAndTags.name << "' at " << _lineInfo );
+
+
         // Add [.] if relevant
         if (isHidden()) {
             internalAppendTag("."_sr);
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.hpp
index 5ff3e3e7..a2f4b43e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_test_case_info.hpp
@@ -8,10 +8,10 @@
 #ifndef CATCH_TEST_CASE_INFO_HPP_INCLUDED
 #define CATCH_TEST_CASE_INFO_HPP_INCLUDED
 
+#include <catch2/interfaces/catch_interfaces_test_invoker.hpp>
 #include <catch2/internal/catch_source_line_info.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/internal/catch_stringref.hpp>
-#include <catch2/internal/catch_test_registry.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 
 
@@ -44,6 +44,7 @@ namespace Catch {
     };
 
     class ITestInvoker;
+    struct NameAndTags;
 
     enum class TestCaseProperties : uint8_t {
         None = 0,
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_test_spec.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_test_spec.cpp
index f27ce99c..f32f9864 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_test_spec.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_test_spec.cpp
@@ -6,6 +6,8 @@
 
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/catch_test_spec.hpp>
+#include <catch2/interfaces/catch_interfaces_testcase.hpp>
+#include <catch2/internal/catch_test_case_registry_impl.hpp>
 #include <catch2/internal/catch_reusable_string_stream.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/catch_test_case_info.hpp>
@@ -106,16 +108,18 @@ namespace Catch {
         return std::any_of( m_filters.begin(), m_filters.end(), [&]( Filter const& f ){ return f.matches( testCase ); } );
     }
 
-    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const
-    {
-        Matches matches( m_filters.size() );
-        std::transform( m_filters.begin(), m_filters.end(), matches.begin(), [&]( Filter const& filter ){
+    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCaseHandle> const& testCases, IConfig const& config ) const {
+        Matches matches;
+        matches.reserve( m_filters.size() );
+        for ( auto const& filter : m_filters ) {
             std::vector<TestCaseHandle const*> currentMatches;
-            for( auto const& test : testCases )
-                if( isThrowSafe( test, config ) && filter.matches( test.getTestCaseInfo() ) )
+            for ( auto const& test : testCases )
+                if ( isThrowSafe( test, config ) &&
+                     filter.matches( test.getTestCaseInfo() ) )
                     currentMatches.emplace_back( &test );
-            return FilterMatch{ extractFilterName(filter), currentMatches };
-        } );
+            matches.push_back(
+                FilterMatch{ extractFilterName( filter ), currentMatches } );
+        }
         return matches;
     }
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_tostring.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_tostring.hpp
index 904caa7e..f3fb0beb 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_tostring.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_tostring.hpp
@@ -116,7 +116,6 @@ namespace Catch {
     } // namespace Detail
 
 
-    // If we decide for C++14, change these to enable_if_ts
     template <typename T, typename = void>
     struct StringMaker {
         template <typename Fake = T>
@@ -399,6 +398,12 @@ namespace Catch {
             }
         }
     };
+    template <>
+    struct StringMaker<std::nullopt_t> {
+        static std::string convert(const std::nullopt_t&) {
+            return "{ }";
+        }
+    };
 }
 #endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.cpp
new file mode 100644
index 00000000..c4b28944
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.cpp
@@ -0,0 +1,20 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_translate_exception.hpp>
+#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
+
+namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator ) {
+            getMutableRegistryHub().registerTranslator(
+                CATCH_MOVE( translator ) );
+        }
+    } // namespace Detail
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.hpp
index 2dbeb17e..5a4dc5e3 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_translate_exception.hpp
@@ -15,6 +15,10 @@
 #include <exception>
 
 namespace Catch {
+    namespace Detail {
+        void registerTranslatorImpl(
+            Detail::unique_ptr<IExceptionTranslator>&& translator );
+    }
 
     class ExceptionTranslatorRegistrar {
         template<typename T>
@@ -48,9 +52,9 @@ namespace Catch {
     public:
         template<typename T>
         ExceptionTranslatorRegistrar( std::string(*translateFunction)( T const& ) ) {
-            getMutableRegistryHub().registerTranslator(
-                Detail::make_unique<ExceptionTranslator<T>>(translateFunction)
-            );
+            Detail::registerTranslatorImpl(
+                Detail::make_unique<ExceptionTranslator<T>>(
+                    translateFunction ) );
         }
     };
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_user_config.hpp.in b/alpaka/thirdParty/catch2/src/catch2/catch_user_config.hpp.in
index 3f6b10e8..10d61937 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_user_config.hpp.in
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_user_config.hpp.in
@@ -169,9 +169,18 @@
 #endif
 
 
+#cmakedefine CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+#cmakedefine CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+
+#if defined( CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT ) && \
+    defined( CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT )
+#    error Cannot force STATIC_ANALYSIS_SUPPORT to both ON and OFF
+#endif
+
+
 // ------
 // Simple toggle defines
-// their value is never used and they cannot be overriden
+// their value is never used and they cannot be overridden
 // ------
 
 
@@ -189,6 +198,7 @@
 #cmakedefine CATCH_CONFIG_FAST_COMPILE
 #cmakedefine CATCH_CONFIG_NOSTDOUT
 #cmakedefine CATCH_CONFIG_PREFIX_ALL
+#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES
 #cmakedefine CATCH_CONFIG_WINDOWS_CRTDBG
 
 #cmakedefine CATCH_CONFIG_SHARED_LIBRARY
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_version.cpp b/alpaka/thirdParty/catch2/src/catch2/catch_version.cpp
index 19cab91b..4e67d968 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_version.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_version.cpp
@@ -36,7 +36,7 @@ namespace Catch {
     }
 
     Version const& libraryVersion() {
-        static Version version( 3, 3, 2, "", 0 );
+        static Version version( 3, 5, 2, "", 0 );
         return version;
     }
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/catch_version_macros.hpp b/alpaka/thirdParty/catch2/src/catch2/catch_version_macros.hpp
index 9ece8505..be2a04d2 100644
--- a/alpaka/thirdParty/catch2/src/catch2/catch_version_macros.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/catch_version_macros.hpp
@@ -9,7 +9,7 @@
 #define CATCH_VERSION_MACROS_HPP_INCLUDED
 
 #define CATCH_VERSION_MAJOR 3
-#define CATCH_VERSION_MINOR 3
+#define CATCH_VERSION_MINOR 5
 #define CATCH_VERSION_PATCH 2
 
 #endif // CATCH_VERSION_MACROS_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators.hpp b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators.hpp
index 117f1901..0f35a996 100644
--- a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators.hpp
@@ -37,12 +37,6 @@ namespace Detail {
         }
 
     public:
-        ~IGenerator() override = default;
-        IGenerator() = default;
-        IGenerator(IGenerator const&) = default;
-        IGenerator& operator=(IGenerator const&) = default;
-
-
         // Returns the current element of the generator
         //
         // \Precondition The generator is either freshly constructed,
diff --git a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.cpp b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.cpp
index 2e3390fd..00a8e634 100644
--- a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.cpp
@@ -7,7 +7,35 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/generators/catch_generators_random.hpp>
-
 #include <catch2/internal/catch_context.hpp>
 
-std::uint32_t Catch::Generators::Detail::getSeed() { return sharedRng()(); }
+#include <random>
+
+namespace Catch {
+    namespace Generators {
+        namespace Detail {
+            std::uint32_t getSeed() { return sharedRng()(); }
+        } // namespace Detail
+
+        struct RandomFloatingGenerator<long double>::PImpl {
+            PImpl( long double a, long double b, uint32_t seed ):
+                rng( seed ), dist( a, b ) {}
+
+            Catch::SimplePcg32 rng;
+            std::uniform_real_distribution<long double> dist;
+        };
+
+        RandomFloatingGenerator<long double>::RandomFloatingGenerator(
+            long double a, long double b, std::uint32_t seed) :
+            m_pimpl(Catch::Detail::make_unique<PImpl>(a, b, seed)) {
+            static_cast<void>( next() );
+        }
+
+        RandomFloatingGenerator<long double>::~RandomFloatingGenerator() =
+            default;
+        bool RandomFloatingGenerator<long double>::next() {
+            m_current_number = m_pimpl->dist( m_pimpl->rng );
+            return true;
+        }
+    } // namespace Generators
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.hpp b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.hpp
index bcd4888d..71283561 100644
--- a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_random.hpp
@@ -8,11 +8,11 @@
 #ifndef CATCH_GENERATORS_RANDOM_HPP_INCLUDED
 #define CATCH_GENERATORS_RANDOM_HPP_INCLUDED
 
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/generators/catch_generators.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
-
-#include <random>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_unique_ptr.hpp>
 
 namespace Catch {
 namespace Generators {
@@ -26,7 +26,7 @@ namespace Detail {
 template <typename Float>
 class RandomFloatingGenerator final : public IGenerator<Float> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_real_distribution<Float> m_dist;
+    Catch::uniform_floating_point_distribution<Float> m_dist;
     Float m_current_number;
 public:
     RandomFloatingGenerator( Float a, Float b, std::uint32_t seed ):
@@ -44,10 +44,27 @@ class RandomFloatingGenerator final : public IGenerator<Float> {
     }
 };
 
+template <>
+class RandomFloatingGenerator<long double> final : public IGenerator<long double> {
+    // We still rely on <random> for this specialization, but we don't
+    // want to drag it into the header.
+    struct PImpl;
+    Catch::Detail::unique_ptr<PImpl> m_pimpl;
+    long double m_current_number;
+
+public:
+    RandomFloatingGenerator( long double a, long double b, std::uint32_t seed );
+
+    long double const& get() const override { return m_current_number; }
+    bool next() override;
+
+    ~RandomFloatingGenerator() override; // = default
+};
+
 template <typename Integer>
 class RandomIntegerGenerator final : public IGenerator<Integer> {
     Catch::SimplePcg32 m_rng;
-    std::uniform_int_distribution<Integer> m_dist;
+    Catch::uniform_integer_distribution<Integer> m_dist;
     Integer m_current_number;
 public:
     RandomIntegerGenerator( Integer a, Integer b, std::uint32_t seed ):
@@ -68,14 +85,6 @@ class RandomIntegerGenerator final : public IGenerator<Integer> {
 template <typename T>
 std::enable_if_t<std::is_integral<T>::value, GeneratorWrapper<T>>
 random(T a, T b) {
-    static_assert(
-        !std::is_same<T, char>::value &&
-        !std::is_same<T, int8_t>::value &&
-        !std::is_same<T, uint8_t>::value &&
-        !std::is_same<T, signed char>::value &&
-        !std::is_same<T, unsigned char>::value &&
-        !std::is_same<T, bool>::value,
-        "The requested type is not supported by the underlying random distributions from std" );
     return GeneratorWrapper<T>(
         Catch::Detail::make_unique<RandomIntegerGenerator<T>>(a, b, Detail::getSeed())
     );
diff --git a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_range.hpp b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_range.hpp
index 495acb95..b67c1590 100644
--- a/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_range.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/generators/catch_generators_range.hpp
@@ -96,10 +96,11 @@ GeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {
     return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(from, to));
 }
 
-template <typename Container,
-          typename ResultType = typename Container::value_type>
-GeneratorWrapper<ResultType> from_range(Container const& cnt) {
-    return GeneratorWrapper<ResultType>(Catch::Detail::make_unique<IteratorGenerator<ResultType>>(cnt.begin(), cnt.end()));
+template <typename Container>
+auto from_range(Container const& cnt) {
+    using std::begin;
+    using std::end;
+    return from_range( begin( cnt ), end( cnt ) );
 }
 
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_all.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_all.hpp
index 87b746d8..a99fdcdc 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_all.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_all.hpp
@@ -30,8 +30,8 @@
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
 #include <catch2/interfaces/catch_interfaces_tag_alias_registry.hpp>
+#include <catch2/interfaces/catch_interfaces_test_invoker.hpp>
 #include <catch2/interfaces/catch_interfaces_testcase.hpp>
 
 #endif // CATCH_INTERFACES_ALL_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_capture.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_capture.hpp
index 2a469c12..a1876a4c 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_capture.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_capture.hpp
@@ -14,6 +14,7 @@
 #include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_result_type.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
+#include <catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp>
 
 namespace Catch {
 
@@ -31,8 +32,6 @@ namespace Catch {
     class IGeneratorTracker;
 
     struct BenchmarkInfo;
-    template <typename Duration = std::chrono::duration<double, std::nano>>
-    struct BenchmarkStats;
 
     namespace Generators {
         class GeneratorUntypedBase;
@@ -44,6 +43,7 @@ namespace Catch {
     public:
         virtual ~IResultCapture();
 
+        virtual void notifyAssertionStarted( AssertionInfo const& info ) = 0;
         virtual bool sectionStarted( StringRef sectionName,
                                      SourceLineInfo const& sectionLineInfo,
                                      Counts& assertions ) = 0;
@@ -84,7 +84,7 @@ namespace Catch {
                     AssertionReaction& reaction ) = 0;
         virtual void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) = 0;
         virtual void handleIncomplete
                 (   AssertionInfo const& info ) = 0;
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_exception.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_exception.hpp
index 9177666a..fcc2a8f9 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_exception.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_exception.hpp
@@ -8,7 +8,6 @@
 #ifndef CATCH_INTERFACES_EXCEPTION_HPP_INCLUDED
 #define CATCH_INTERFACES_EXCEPTION_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 
 #include <string>
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp
index 8813b538..113f223e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp
@@ -19,7 +19,7 @@ namespace Catch {
     class ITestCaseRegistry;
     class IExceptionTranslatorRegistry;
     class IExceptionTranslator;
-    class IReporterRegistry;
+    class ReporterRegistry;
     class IReporterFactory;
     class ITagAliasRegistry;
     class ITestInvoker;
@@ -35,7 +35,7 @@ namespace Catch {
     public:
         virtual ~IRegistryHub(); // = default
 
-        virtual IReporterRegistry const& getReporterRegistry() const = 0;
+        virtual ReporterRegistry const& getReporterRegistry() const = 0;
         virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
         virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
         virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
index 67c5c80e..90536bb3 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp
@@ -7,19 +7,11 @@
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
-#include <catch2/internal/catch_console_colour.hpp>
-#include <catch2/internal/catch_console_width.hpp>
 #include <catch2/catch_message.hpp>
-#include <catch2/internal/catch_list.hpp>
-#include <catch2/internal/catch_string_manip.hpp>
-#include <catch2/catch_test_case_info.hpp>
-#include <catch2/reporters/catch_reporter_helpers.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_istream.hpp>
 
-#include <algorithm>
 #include <cassert>
-#include <iomanip>
 
 namespace Catch {
 
@@ -54,8 +46,6 @@ namespace Catch {
         infoMessages( _infoMessages ),
         totals( _totals )
     {
-        assertionResult.m_resultData.lazyExpression.m_transientExpression = _assertionResult.m_resultData.lazyExpression.m_transientExpression;
-
         if( assertionResult.hasMessage() ) {
             // Copy message into messages list.
             // !TBD This should have been done earlier, somewhere
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
index cf414f10..a052c5db 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp
@@ -13,11 +13,9 @@
 #include <catch2/catch_assertion_result.hpp>
 #include <catch2/internal/catch_message_info.hpp>
 #include <catch2/internal/catch_stringref.hpp>
+#include <catch2/internal/catch_test_run_info.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
-#include <catch2/benchmark/catch_estimate.hpp>
-#include <catch2/benchmark/catch_outlier_classification.hpp>
-
+#include <catch2/benchmark/detail/catch_benchmark_stats.hpp>
 
 #include <map>
 #include <string>
@@ -57,11 +55,6 @@ namespace Catch {
         std::map<std::string, std::string> m_customOptions;
     };
 
-    struct TestRunInfo {
-        constexpr TestRunInfo(StringRef _name) : name(_name) {}
-        StringRef name;
-    };
-
     struct AssertionStats {
         AssertionStats( AssertionResult const& _assertionResult,
                         std::vector<MessageInfo> const& _infoMessages,
@@ -113,45 +106,6 @@ namespace Catch {
         bool aborting;
     };
 
-
-    struct BenchmarkInfo {
-        std::string name;
-        double estimatedDuration;
-        int iterations;
-        unsigned int samples;
-        unsigned int resamples;
-        double clockResolution;
-        double clockCost;
-    };
-
-    template <class Duration>
-    struct BenchmarkStats {
-        BenchmarkInfo info;
-
-        std::vector<Duration> samples;
-        Benchmark::Estimate<Duration> mean;
-        Benchmark::Estimate<Duration> standardDeviation;
-        Benchmark::OutlierClassification outliers;
-        double outlierVariance;
-
-        template <typename Duration2>
-        operator BenchmarkStats<Duration2>() const {
-            std::vector<Duration2> samples2;
-            samples2.reserve(samples.size());
-            for (auto const& sample : samples) {
-                samples2.push_back(Duration2(sample));
-            }
-            return {
-                info,
-                CATCH_MOVE(samples2),
-                mean,
-                standardDeviation,
-                outliers,
-                outlierVariance,
-            };
-        }
-    };
-
     //! By setting up its preferences, a reporter can modify Catch2's behaviour
     //! in some regards, e.g. it can request Catch2 to capture writes to
     //! stdout/stderr during test execution, and pass them to the reporter.
@@ -250,7 +204,7 @@ namespace Catch {
          */
         virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
 
-        //! Called if a fatal error (signal/structured exception) occured
+        //! Called if a fatal error (signal/structured exception) occurred
         virtual void fatalErrorEncountered( StringRef error ) = 0;
 
         //! Writes out information about provided reporters using reporter-specific format
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.cpp
deleted file mode 100644
index f620cbc8..00000000
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-
-//              Copyright Catch2 Authors
-// Distributed under the Boost Software License, Version 1.0.
-//   (See accompanying file LICENSE.txt or copy at
-//        https://www.boost.org/LICENSE_1_0.txt)
-
-// SPDX-License-Identifier: BSL-1.0
-
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
-
-namespace Catch {
-    IReporterRegistry::~IReporterRegistry() = default;
-}
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.hpp
deleted file mode 100644
index 277d1761..00000000
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_reporter_registry.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-
-//              Copyright Catch2 Authors
-// Distributed under the Boost Software License, Version 1.0.
-//   (See accompanying file LICENSE.txt or copy at
-//        https://www.boost.org/LICENSE_1_0.txt)
-
-// SPDX-License-Identifier: BSL-1.0
-#ifndef CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-#define CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
-
-#include <catch2/internal/catch_case_insensitive_comparisons.hpp>
-#include <catch2/internal/catch_unique_ptr.hpp>
-
-#include <string>
-#include <vector>
-#include <map>
-
-namespace Catch {
-
-    class IConfig;
-
-    class IEventListener;
-    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
-    class IReporterFactory;
-    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
-    struct ReporterConfig;
-    class EventListenerFactory;
-
-    class IReporterRegistry {
-    public:
-        using FactoryMap = std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>;
-        using Listeners = std::vector<Detail::unique_ptr<EventListenerFactory>>;
-
-        virtual ~IReporterRegistry(); // = default
-        virtual IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const = 0;
-        virtual FactoryMap const& getFactories() const = 0;
-        virtual Listeners const& getListeners() const = 0;
-    };
-
-} // end namespace Catch
-
-#endif // CATCH_INTERFACES_REPORTER_REGISTRY_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_test_invoker.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_test_invoker.hpp
new file mode 100644
index 00000000..3caeff9a
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_test_invoker.hpp
@@ -0,0 +1,21 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
+#define CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
+
+namespace Catch {
+
+    class ITestInvoker {
+    public:
+        virtual void invoke() const = 0;
+        virtual ~ITestInvoker(); // = default
+    };
+
+} // namespace Catch
+
+#endif // CATCH_INTERFACES_TEST_INVOKER_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp
index 5e632ba8..a543116c 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp
@@ -9,6 +9,5 @@
 #include <catch2/interfaces/catch_interfaces_testcase.hpp>
 
 namespace Catch {
-    ITestInvoker::~ITestInvoker() = default;
     ITestCaseRegistry::~ITestCaseRegistry() = default;
 }
diff --git a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp
index 78ee2021..daee8482 100644
--- a/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp
@@ -12,15 +12,7 @@
 
 namespace Catch {
 
-    class TestSpec;
     struct TestCaseInfo;
-
-    class ITestInvoker {
-    public:
-        virtual void invoke () const = 0;
-        virtual ~ITestInvoker(); // = default
-    };
-
     class TestCaseHandle;
     class IConfig;
 
@@ -33,11 +25,6 @@ namespace Catch {
         virtual std::vector<TestCaseHandle> const& getAllTestsSorted( IConfig const& config ) const = 0;
     };
 
-    bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
-    std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
-
 }
 
 #endif // CATCH_INTERFACES_TESTCASE_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.cpp
index 0b14e0bb..f650a707 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.cpp
@@ -8,11 +8,8 @@
 #include <catch2/internal/catch_assertion_handler.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/internal/catch_context.hpp>
-#include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_debugger.hpp>
 #include <catch2/internal/catch_test_failure_exception.hpp>
-#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
-#include <catch2/internal/catch_run_context.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
 namespace Catch {
@@ -24,7 +21,9 @@ namespace Catch {
             ResultDisposition::Flags resultDisposition )
     :   m_assertionInfo{ macroName, lineInfo, capturedExpression, resultDisposition },
         m_resultCapture( getResultCapture() )
-    {}
+    {
+        m_resultCapture.notifyAssertionStarted( m_assertionInfo );
+    }
 
     void AssertionHandler::handleExpr( ITransientExpression const& expr ) {
         m_resultCapture.handleExpr( m_assertionInfo, expr, m_reaction );
@@ -38,7 +37,7 @@ namespace Catch {
     }
 
     void AssertionHandler::complete() {
-        setCompleted();
+        m_completed = true;
         if( m_reaction.shouldDebugBreak ) {
 
             // If you find your debugger stopping you here then go one level up on the
@@ -51,16 +50,9 @@ namespace Catch {
             throw_test_failure_exception();
         }
         if ( m_reaction.shouldSkip ) {
-#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
-            throw Catch::TestSkipException();
-#else
-            CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
-#endif
+            throw_test_skip_exception();
         }
     }
-    void AssertionHandler::setCompleted() {
-        m_completed = true;
-    }
 
     void AssertionHandler::handleUnexpectedInflightException() {
         m_resultCapture.handleUnexpectedInflightException( m_assertionInfo, Catch::translateActiveException(), m_reaction );
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.hpp
index ae7776d8..01dd7801 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_assertion_handler.hpp
@@ -11,14 +11,11 @@
 #include <catch2/catch_assertion_info.hpp>
 #include <catch2/internal/catch_decomposer.hpp>
 #include <catch2/interfaces/catch_interfaces_capture.hpp>
-#include <catch2/internal/catch_lazy_expr.hpp>
 
 #include <string>
 
 namespace Catch {
 
-    class IResultCapture;
-
     struct AssertionReaction {
         bool shouldDebugBreak = false;
         bool shouldThrow = false;
@@ -59,7 +56,6 @@ namespace Catch {
         void handleUnexpectedInflightException();
 
         void complete();
-        void setCompleted();
 
         // query
         auto allowThrows() const -> bool;
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.cpp
index c9bc7695..c76089ee 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.cpp
@@ -11,6 +11,7 @@
 #include <catch2/internal/catch_platform.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_textflow.hpp>
+#include <catch2/internal/catch_reusable_string_stream.hpp>
 
 #include <algorithm>
 #include <ostream>
@@ -24,13 +25,29 @@ namespace {
             ;
     }
 
-    std::string normaliseOpt( std::string const& optName ) {
-#ifdef CATCH_PLATFORM_WINDOWS
-        if ( optName[0] == '/' )
-            return "-" + optName.substr( 1 );
-        else
+    Catch::StringRef normaliseOpt( Catch::StringRef optName ) {
+        if ( optName[0] == '-'
+#if defined(CATCH_PLATFORM_WINDOWS)
+             || optName[0] == '/'
 #endif
-            return optName;
+        ) {
+            return optName.substr( 1, optName.size() );
+        }
+
+        return optName;
+    }
+
+    static size_t find_first_separator(Catch::StringRef sr) {
+        auto is_separator = []( char c ) {
+            return c == ' ' || c == ':' || c == '=';
+        };
+        size_t pos = 0;
+        while (pos < sr.size()) {
+            if (is_separator(sr[pos])) { return pos; }
+            ++pos;
+        }
+
+        return Catch::StringRef::npos;
     }
 
 } // namespace
@@ -48,23 +65,23 @@ namespace Catch {
                 }
 
                 if ( it != itEnd ) {
-                    auto const& next = *it;
+                    StringRef next = *it;
                     if ( isOptPrefix( next[0] ) ) {
-                        auto delimiterPos = next.find_first_of( " :=" );
-                        if ( delimiterPos != std::string::npos ) {
+                        auto delimiterPos = find_first_separator(next);
+                        if ( delimiterPos != StringRef::npos ) {
                             m_tokenBuffer.push_back(
                                 { TokenType::Option,
                                   next.substr( 0, delimiterPos ) } );
                             m_tokenBuffer.push_back(
                                 { TokenType::Argument,
-                                  next.substr( delimiterPos + 1 ) } );
+                                  next.substr( delimiterPos + 1, next.size() ) } );
                         } else {
                             if ( next[1] != '-' && next.size() > 2 ) {
-                                std::string opt = "- ";
+                                // Combined short args, e.g. "-ab" for "-a -b"
                                 for ( size_t i = 1; i < next.size(); ++i ) {
-                                    opt[1] = next[i];
                                     m_tokenBuffer.push_back(
-                                        { TokenType::Option, opt } );
+                                        { TokenType::Option,
+                                          next.substr( i, 1 ) } );
                                 }
                             } else {
                                 m_tokenBuffer.push_back(
@@ -124,12 +141,12 @@ namespace Catch {
             size_t ParserBase::cardinality() const { return 1; }
 
             InternalParseResult ParserBase::parse( Args const& args ) const {
-                return parse( args.exeName(), TokenStream( args ) );
+                return parse( static_cast<std::string>(args.exeName()), TokenStream( args ) );
             }
 
             ParseState::ParseState( ParseResultType type,
-                                    TokenStream const& remainingTokens ):
-                m_type( type ), m_remainingTokens( remainingTokens ) {}
+                                    TokenStream remainingTokens ):
+                m_type( type ), m_remainingTokens( CATCH_MOVE(remainingTokens) ) {}
 
             ParserResult BoundFlagRef::setFlag( bool flag ) {
                 m_ref = flag;
@@ -147,34 +164,34 @@ namespace Catch {
 } // namespace Detail
 
         Detail::InternalParseResult Arg::parse(std::string const&,
-                                               Detail::TokenStream const& tokens) const {
+                                               Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            auto const& token = *remainingTokens;
+            auto token = *tokens;
             if (token.type != Detail::TokenType::Argument)
                 return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::NoMatch, remainingTokens));
+                    ParseResultType::NoMatch, CATCH_MOVE(tokens)));
 
             assert(!m_ref->isFlag());
             auto valueRef =
                 static_cast<Detail::BoundValueRefBase*>(m_ref.get());
 
-            auto result = valueRef->setValue(remainingTokens->token);
-            if (!result)
-                return Detail::InternalParseResult(result);
+            auto result = valueRef->setValue(static_cast<std::string>(token.token));
+            if ( !result )
+                return Detail::InternalParseResult( result );
             else
-                return Detail::InternalParseResult::ok(Detail::ParseState(
-                    ParseResultType::Matched, ++remainingTokens));
+                return Detail::InternalParseResult::ok(
+                    Detail::ParseState( ParseResultType::Matched,
+                                        CATCH_MOVE( ++tokens ) ) );
         }
 
         Opt::Opt(bool& ref) :
             ParserRefImpl(std::make_shared<Detail::BoundFlagRef>(ref)) {}
 
-        std::vector<Detail::HelpColumns> Opt::getHelpColumns() const {
-            std::ostringstream oss;
+        Detail::HelpColumns Opt::getHelpColumns() const {
+            ReusableStringStream oss;
             bool first = true;
             for (auto const& opt : m_optNames) {
                 if (first)
@@ -185,10 +202,10 @@ namespace Catch {
             }
             if (!m_hint.empty())
                 oss << " <" << m_hint << '>';
-            return { { oss.str(), m_description } };
+            return { oss.str(), m_description };
         }
 
-        bool Opt::isMatch(std::string const& optToken) const {
+        bool Opt::isMatch(StringRef optToken) const {
             auto normalisedToken = normaliseOpt(optToken);
             for (auto const& name : m_optNames) {
                 if (normaliseOpt(name) == normalisedToken)
@@ -198,15 +215,14 @@ namespace Catch {
         }
 
         Detail::InternalParseResult Opt::parse(std::string const&,
-                                       Detail::TokenStream const& tokens) const {
+                                       Detail::TokenStream tokens) const {
             auto validationResult = validate();
             if (!validationResult)
                 return Detail::InternalParseResult(validationResult);
 
-            auto remainingTokens = tokens;
-            if (remainingTokens &&
-                remainingTokens->type == Detail::TokenType::Option) {
-                auto const& token = *remainingTokens;
+            if (tokens &&
+                tokens->type == Detail::TokenType::Option) {
+                auto const& token = *tokens;
                 if (isMatch(token.token)) {
                     if (m_ref->isFlag()) {
                         auto flagRef =
@@ -218,35 +234,35 @@ namespace Catch {
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     } else {
                         auto valueRef =
                             static_cast<Detail::BoundValueRefBase*>(
                                 m_ref.get());
-                        ++remainingTokens;
-                        if (!remainingTokens)
+                        ++tokens;
+                        if (!tokens)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        auto const& argToken = *remainingTokens;
+                        auto const& argToken = *tokens;
                         if (argToken.type != Detail::TokenType::Argument)
                             return Detail::InternalParseResult::runtimeError(
                                 "Expected argument following " +
                                 token.token);
-                        const auto result = valueRef->setValue(argToken.token);
+                        const auto result = valueRef->setValue(static_cast<std::string>(argToken.token));
                         if (!result)
                             return Detail::InternalParseResult(result);
                         if (result.value() ==
                             ParseResultType::ShortCircuitAll)
                             return Detail::InternalParseResult::ok(Detail::ParseState(
-                                result.value(), remainingTokens));
+                                result.value(), CATCH_MOVE(tokens)));
                     }
                     return Detail::InternalParseResult::ok(Detail::ParseState(
-                        ParseResultType::Matched, ++remainingTokens));
+                        ParseResultType::Matched, CATCH_MOVE(++tokens)));
                 }
             }
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, remainingTokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         Detail::Result Opt::validate() const {
@@ -278,9 +294,9 @@ namespace Catch {
 
         Detail::InternalParseResult
             ExeName::parse(std::string const&,
-                           Detail::TokenStream const& tokens) const {
+                           Detail::TokenStream tokens) const {
             return Detail::InternalParseResult::ok(
-                Detail::ParseState(ParseResultType::NoMatch, tokens));
+                Detail::ParseState(ParseResultType::NoMatch, CATCH_MOVE(tokens)));
         }
 
         ParserResult ExeName::set(std::string const& newName) {
@@ -310,9 +326,9 @@ namespace Catch {
 
         std::vector<Detail::HelpColumns> Parser::getHelpColumns() const {
             std::vector<Detail::HelpColumns> cols;
+            cols.reserve( m_options.size() );
             for ( auto const& o : m_options ) {
-                auto childCols = o.getHelpColumns();
-                cols.insert( cols.end(), childCols.begin(), childCols.end() );
+                cols.push_back(o.getHelpColumns());
             }
             return cols;
         }
@@ -350,12 +366,12 @@ namespace Catch {
 
             optWidth = ( std::min )( optWidth, consoleWidth / 2 );
 
-            for ( auto const& cols : rows ) {
-                auto row = TextFlow::Column( cols.left )
+            for ( auto& cols : rows ) {
+                auto row = TextFlow::Column( CATCH_MOVE(cols.left) )
                                .width( optWidth )
                                .indent( 2 ) +
                            TextFlow::Spacer( 4 ) +
-                           TextFlow::Column( cols.right )
+                           TextFlow::Column( static_cast<std::string>(cols.descriptions) )
                                .width( consoleWidth - 7 - optWidth );
                 os << row << '\n';
             }
@@ -377,7 +393,7 @@ namespace Catch {
 
         Detail::InternalParseResult
         Parser::parse( std::string const& exeName,
-                       Detail::TokenStream const& tokens ) const {
+                       Detail::TokenStream tokens ) const {
 
             struct ParserInfo {
                 ParserBase const* parser = nullptr;
@@ -395,7 +411,7 @@ namespace Catch {
             m_exeName.set( exeName );
 
             auto result = Detail::InternalParseResult::ok(
-                Detail::ParseState( ParseResultType::NoMatch, tokens ) );
+                Detail::ParseState( ParseResultType::NoMatch, CATCH_MOVE(tokens) ) );
             while ( result.value().remainingTokens() ) {
                 bool tokenParsed = false;
 
@@ -403,7 +419,7 @@ namespace Catch {
                     if ( parseInfo.parser->cardinality() == 0 ||
                          parseInfo.count < parseInfo.parser->cardinality() ) {
                         result = parseInfo.parser->parse(
-                            exeName, result.value().remainingTokens() );
+                            exeName, CATCH_MOVE(result).value().remainingTokens() );
                         if ( !result )
                             return result;
                         if ( result.value().type() !=
@@ -429,7 +445,7 @@ namespace Catch {
         Args::Args(int argc, char const* const* argv) :
             m_exeName(argv[0]), m_args(argv + 1, argv + argc) {}
 
-        Args::Args(std::initializer_list<std::string> args) :
+        Args::Args(std::initializer_list<StringRef> args) :
             m_exeName(*args.begin()),
             m_args(args.begin() + 1, args.end()) {}
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.hpp
index 9117b65e..d869593b 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_clara.hpp
@@ -29,6 +29,7 @@
 #    endif
 #endif
 
+#include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/internal/catch_void_type.hpp>
@@ -101,17 +102,16 @@ namespace Catch {
             enum class TokenType { Option, Argument };
             struct Token {
                 TokenType type;
-                std::string token;
+                StringRef token;
             };
 
             // Abstracts iterators into args as a stream of tokens, with option
             // arguments uniformly handled
             class TokenStream {
-                using Iterator = std::vector<std::string>::const_iterator;
+                using Iterator = std::vector<StringRef>::const_iterator;
                 Iterator it;
                 Iterator itEnd;
                 std::vector<Token> m_tokenBuffer;
-
                 void loadBuffer();
 
             public:
@@ -163,12 +163,17 @@ namespace Catch {
                 ResultType m_type;
             };
 
-            template <typename T> class ResultValueBase : public ResultBase {
+            template <typename T>
+            class ResultValueBase : public ResultBase {
             public:
-                auto value() const -> T const& {
+                T const& value() const& {
                     enforceOk();
                     return m_value;
                 }
+                T&& value() && {
+                    enforceOk();
+                    return CATCH_MOVE( m_value );
+                }
 
             protected:
                 ResultValueBase( ResultType type ): ResultBase( type ) {}
@@ -178,13 +183,23 @@ namespace Catch {
                     if ( m_type == ResultType::Ok )
                         new ( &m_value ) T( other.m_value );
                 }
+                ResultValueBase( ResultValueBase&& other ):
+                    ResultBase( other ) {
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                }
+
 
-                ResultValueBase( ResultType, T const& value ): ResultBase( ResultType::Ok ) {
+                ResultValueBase( ResultType, T const& value ):
+                    ResultBase( ResultType::Ok ) {
                     new ( &m_value ) T( value );
                 }
+                ResultValueBase( ResultType, T&& value ):
+                    ResultBase( ResultType::Ok ) {
+                    new ( &m_value ) T( CATCH_MOVE(value) );
+                }
 
-                auto operator=( ResultValueBase const& other )
-                    -> ResultValueBase& {
+                ResultValueBase& operator=( ResultValueBase const& other ) {
                     if ( m_type == ResultType::Ok )
                         m_value.~T();
                     ResultBase::operator=( other );
@@ -192,6 +207,14 @@ namespace Catch {
                         new ( &m_value ) T( other.m_value );
                     return *this;
                 }
+                ResultValueBase& operator=( ResultValueBase&& other ) {
+                    if ( m_type == ResultType::Ok ) m_value.~T();
+                    ResultBase::operator=( other );
+                    if ( m_type == ResultType::Ok )
+                        new ( &m_value ) T( CATCH_MOVE(other.m_value) );
+                    return *this;
+                }
+
 
                 ~ResultValueBase() override {
                     if ( m_type == ResultType::Ok )
@@ -219,8 +242,8 @@ namespace Catch {
                 }
 
                 template <typename U>
-                static auto ok( U const& value ) -> BasicResult {
-                    return { ResultType::Ok, value };
+                static auto ok( U&& value ) -> BasicResult {
+                    return { ResultType::Ok, CATCH_FORWARD(value) };
                 }
                 static auto ok() -> BasicResult { return { ResultType::Ok }; }
                 static auto logicError( std::string&& message )
@@ -267,12 +290,15 @@ namespace Catch {
             class ParseState {
             public:
                 ParseState( ParseResultType type,
-                            TokenStream const& remainingTokens );
+                            TokenStream remainingTokens );
 
                 ParseResultType type() const { return m_type; }
-                TokenStream const& remainingTokens() const {
+                TokenStream const& remainingTokens() const& {
                     return m_remainingTokens;
                 }
+                TokenStream&& remainingTokens() && {
+                    return CATCH_MOVE( m_remainingTokens );
+                }
 
             private:
                 ParseResultType m_type;
@@ -285,7 +311,7 @@ namespace Catch {
 
             struct HelpColumns {
                 std::string left;
-                std::string right;
+                StringRef descriptions;
             };
 
             template <typename T>
@@ -445,7 +471,7 @@ namespace Catch {
                 virtual ~ParserBase() = default;
                 virtual auto validate() const -> Result { return Result::ok(); }
                 virtual auto parse( std::string const& exeName,
-                                    TokenStream const& tokens ) const
+                                    TokenStream tokens ) const
                     -> InternalParseResult = 0;
                 virtual size_t cardinality() const;
 
@@ -465,8 +491,8 @@ namespace Catch {
             protected:
                 Optionality m_optionality = Optionality::Optional;
                 std::shared_ptr<BoundRef> m_ref;
-                std::string m_hint;
-                std::string m_description;
+                StringRef m_hint;
+                StringRef m_description;
 
                 explicit ParserRefImpl( std::shared_ptr<BoundRef> const& ref ):
                     m_ref( ref ) {}
@@ -475,28 +501,32 @@ namespace Catch {
                 template <typename LambdaT>
                 ParserRefImpl( accept_many_t,
                                LambdaT const& ref,
-                               std::string const& hint ):
+                               StringRef hint ):
                     m_ref( std::make_shared<BoundManyLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename T,
                           typename = typename std::enable_if_t<
                               !Detail::is_unary_function<T>::value>>
-                ParserRefImpl( T& ref, std::string const& hint ):
+                ParserRefImpl( T& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundValueRef<T>>( ref ) ),
                     m_hint( hint ) {}
 
                 template <typename LambdaT,
                           typename = typename std::enable_if_t<
                               Detail::is_unary_function<LambdaT>::value>>
-                ParserRefImpl( LambdaT const& ref, std::string const& hint ):
+                ParserRefImpl( LambdaT const& ref, StringRef hint ):
                     m_ref( std::make_shared<BoundLambda<LambdaT>>( ref ) ),
                     m_hint( hint ) {}
 
-                auto operator()( std::string const& description ) -> DerivedT& {
+                DerivedT& operator()( StringRef description ) & {
                     m_description = description;
                     return static_cast<DerivedT&>( *this );
                 }
+                DerivedT&& operator()( StringRef description ) && {
+                    m_description = description;
+                    return static_cast<DerivedT&&>( *this );
+                }
 
                 auto optional() -> DerivedT& {
                     m_optionality = Optionality::Optional;
@@ -519,7 +549,7 @@ namespace Catch {
                         return 1;
                 }
 
-                std::string const& hint() const { return m_hint; }
+                StringRef hint() const { return m_hint; }
             };
 
         } // namespace detail
@@ -533,13 +563,13 @@ namespace Catch {
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
         // A parser for options
         class Opt : public Detail::ParserRefImpl<Opt> {
         protected:
-            std::vector<std::string> m_optNames;
+            std::vector<StringRef> m_optNames;
 
         public:
             template <typename LambdaT>
@@ -552,33 +582,37 @@ namespace Catch {
             template <typename LambdaT,
                       typename = typename std::enable_if_t<
                           Detail::is_unary_function<LambdaT>::value>>
-            Opt( LambdaT const& ref, std::string const& hint ):
+            Opt( LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
             template <typename LambdaT>
-            Opt( accept_many_t, LambdaT const& ref, std::string const& hint ):
+            Opt( accept_many_t, LambdaT const& ref, StringRef hint ):
                 ParserRefImpl( accept_many, ref, hint ) {}
 
             template <typename T,
                       typename = typename std::enable_if_t<
                           !Detail::is_unary_function<T>::value>>
-            Opt( T& ref, std::string const& hint ):
+            Opt( T& ref, StringRef hint ):
                 ParserRefImpl( ref, hint ) {}
 
-            auto operator[](std::string const& optName) -> Opt& {
+            Opt& operator[]( StringRef optName ) & {
                 m_optNames.push_back(optName);
                 return *this;
             }
+            Opt&& operator[]( StringRef optName ) && {
+                m_optNames.push_back( optName );
+                return CATCH_MOVE(*this);
+            }
 
-            std::vector<Detail::HelpColumns> getHelpColumns() const;
+            Detail::HelpColumns getHelpColumns() const;
 
-            bool isMatch(std::string const& optToken) const;
+            bool isMatch(StringRef optToken) const;
 
             using ParserBase::parse;
 
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             Detail::Result validate() const override;
         };
@@ -601,7 +635,7 @@ namespace Catch {
             // handled specially
             Detail::InternalParseResult
                 parse(std::string const&,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
 
             std::string const& name() const { return *m_name; }
             Detail::ParserResult set(std::string const& newName);
@@ -626,16 +660,28 @@ namespace Catch {
                 return *this;
             }
 
-            auto operator|=(Opt const& opt) -> Parser& {
-                m_options.push_back(opt);
-                return *this;
+            friend Parser& operator|=( Parser& p, Opt const& opt ) {
+                p.m_options.push_back( opt );
+                return p;
+            }
+            friend Parser& operator|=( Parser& p, Opt&& opt ) {
+                p.m_options.push_back( CATCH_MOVE(opt) );
+                return p;
             }
 
             Parser& operator|=(Parser const& other);
 
             template <typename T>
-            auto operator|(T const& other) const -> Parser {
-                return Parser(*this) |= other;
+            friend Parser operator|( Parser const& p, T&& rhs ) {
+                Parser temp( p );
+                temp |= rhs;
+                return temp;
+            }
+
+            template <typename T>
+            friend Parser operator|( Parser&& p, T&& rhs ) {
+                p |= CATCH_FORWARD(rhs);
+                return CATCH_MOVE(p);
             }
 
             std::vector<Detail::HelpColumns> getHelpColumns() const;
@@ -653,21 +699,23 @@ namespace Catch {
             using ParserBase::parse;
             Detail::InternalParseResult
                 parse(std::string const& exeName,
-                      Detail::TokenStream const& tokens) const override;
+                      Detail::TokenStream tokens) const override;
         };
 
-        // Transport for raw args (copied from main args, or supplied via
-        // init list for testing)
+        /**
+         * Wrapper over argc + argv, assumes that the inputs outlive it
+         */
         class Args {
             friend Detail::TokenStream;
-            std::string m_exeName;
-            std::vector<std::string> m_args;
+            StringRef m_exeName;
+            std::vector<StringRef> m_args;
 
         public:
             Args(int argc, char const* const* argv);
-            Args(std::initializer_list<std::string> args);
+            // Helper constructor for testing
+            Args(std::initializer_list<StringRef> args);
 
-            std::string const& exeName() const { return m_exeName; }
+            StringRef exeName() const { return m_exeName; }
         };
 
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_commandline.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_commandline.cpp
index 81aa073c..c29a801d 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_commandline.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_commandline.cpp
@@ -9,8 +9,9 @@
 
 #include <catch2/catch_config.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/internal/catch_console_colour.hpp>
 #include <catch2/internal/catch_parse_numbers.hpp>
 #include <catch2/internal/catch_reporter_spec_parser.hpp>
@@ -144,7 +145,7 @@ namespace Catch {
 
             auto const& reporterSpec = *parsed;
 
-            IReporterRegistry::FactoryMap const& factories =
+            auto const& factories =
                 getRegistryHub().getReporterRegistry().getFactories();
             auto result = factories.find( reporterSpec.name() );
 
@@ -300,8 +301,8 @@ namespace Catch {
                 ( "split the tests to execute into this many groups" )
             | Opt( setShardIndex, "shard index" )
                 ["--shard-index"]
-                ( "index of the group of tests to execute (see --shard-count)" ) |
-            Opt( config.allowZeroTests )
+                ( "index of the group of tests to execute (see --shard-count)" )
+            | Opt( config.allowZeroTests )
                 ["--allow-running-no-tests"]
                 ( "Treat 'No tests run' as a success" )
             | Arg( config.testsOrTags, "test name|pattern|tags" )
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_compiler_capabilities.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_compiler_capabilities.hpp
index 42631a5f..dacae01b 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_compiler_capabilities.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_compiler_capabilities.hpp
@@ -50,12 +50,18 @@
 #    define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wparentheses\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
+         _Pragma( "GCC diagnostic ignored \"-Wunused-result\"" )
+
 #    define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wunused-variable\"" )
 
 #    define CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
          _Pragma( "GCC diagnostic ignored \"-Wuseless-cast\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+         _Pragma( "GCC diagnostic ignored \"-Wshadow\"" )
+
 #    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__)
 
 #endif
@@ -128,6 +134,9 @@
 #    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS \
         _Pragma( "clang diagnostic ignored \"-Wcomma\"" )
 
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS \
+        _Pragma( "clang diagnostic ignored \"-Wshadow\"" )
+
 #endif // __clang__
 
 
@@ -147,7 +156,9 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // Assume that some platforms do not support getenv.
-#if defined(CATCH_PLATFORM_WINDOWS_UWP) || defined(CATCH_PLATFORM_PLAYSTATION)
+#if defined( CATCH_PLATFORM_WINDOWS_UWP ) ||                                   \
+    defined( CATCH_PLATFORM_PLAYSTATION ) ||                                   \
+    defined( _GAMING_XBOX )
 #    define CATCH_INTERNAL_CONFIG_NO_GETENV
 #else
 #    define CATCH_INTERNAL_CONFIG_GETENV
@@ -365,6 +376,9 @@
 #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
 #endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT)
+#   define CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT
+#endif
 #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS
 #endif
@@ -374,6 +388,16 @@
 #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS)
 #   define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS
 #endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
+#endif
+#if !defined( CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS )
+#    define CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS
+#endif
+
 
 // The goal of this macro is to avoid evaluation of the arguments, but
 // still have the compiler warn on problems inside...
@@ -387,13 +411,6 @@
 #   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
 #endif
 
-#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
-#endif
-
-#if !defined(CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS)
-#   define CATCH_INTERNAL_SUPPRESS_COMMA_WARNINGS
-#endif
 
 #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 #define CATCH_TRY if ((true))
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_counter.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_counter.hpp
index 23b22324..a482ce34 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_counter.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_counter.hpp
@@ -18,6 +18,8 @@
 #ifndef CATCH_CONFIG_COUNTER_HPP_INCLUDED
 #define CATCH_CONFIG_COUNTER_HPP_INCLUDED
 
+#include <catch2/catch_user_config.hpp>
+
 #if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
     #define CATCH_INTERNAL_CONFIG_COUNTER
 #endif
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_prefix_messages.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_prefix_messages.hpp
new file mode 100644
index 00000000..be1e9a96
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_prefix_messages.hpp
@@ -0,0 +1,29 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+/** \file
+ * Wrapper for the CATCH_CONFIG_PREFIX_MESSAGES configuration option
+ *
+ * CATCH_CONFIG_PREFIX_ALL can be used to avoid clashes with other macros
+ * by prepending CATCH_. This may not be desirable if the only clashes are with
+ * logger macros such as INFO and WARN. In this cases
+ * CATCH_CONFIG_PREFIX_MESSAGES can be used to only prefix a small subset
+ * of relevant macros.
+ *
+ */
+
+#ifndef CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+#define CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
+
+#include <catch2/catch_user_config.hpp>
+
+#if defined(CATCH_CONFIG_PREFIX_ALL) && !defined(CATCH_CONFIG_PREFIX_MESSAGES)
+    #define CATCH_CONFIG_PREFIX_MESSAGES
+#endif
+
+#endif // CATCH_CONFIG_PREFIX_MESSAGES_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_static_analysis_support.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_static_analysis_support.hpp
new file mode 100644
index 00000000..81bdf39f
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_static_analysis_support.hpp
@@ -0,0 +1,34 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+/** \file
+ * Wrapper for the STATIC_ANALYSIS_SUPPORT configuration option
+ *
+ * Some of Catch2's macros can be defined differently to work better with
+ * static analysis tools, like clang-tidy or coverity.
+ * Currently the main use case is to show that `SECTION`s are executed
+ * exclusively, and not all in one run of a `TEST_CASE`.
+ */
+
+#ifndef CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+#define CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
+
+#include <catch2/catch_user_config.hpp>
+
+#if defined(__clang_analyzer__) || defined(__COVERITY__)
+    #define CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT
+#endif
+
+#if defined( CATCH_INTERNAL_CONFIG_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT ) && \
+    !defined( CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT )
+#    define CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+#endif
+
+
+#endif // CATCH_CONFIG_STATIC_ANALYSIS_SUPPORT_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp
index 5c4cb930..20b1dfca 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp
@@ -17,6 +17,8 @@
 #ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
 #define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP_INCLUDED
 
+#include <catch2/catch_user_config.hpp>
+
 #if defined(_MSC_VER)
 #  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
 #    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_wchar.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_wchar.hpp
index 8c758ec4..90d85d05 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_wchar.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_config_wchar.hpp
@@ -17,6 +17,8 @@
 #ifndef CATCH_CONFIG_WCHAR_HPP_INCLUDED
 #define CATCH_CONFIG_WCHAR_HPP_INCLUDED
 
+#include <catch2/catch_user_config.hpp>
+
 // We assume that WCHAR should be enabled by default, and only disabled
 // for a shortlist (so far only DJGPP) of compilers.
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_console_colour.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_console_colour.cpp
index 099a6c59..e1238816 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_console_colour.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_console_colour.cpp
@@ -85,7 +85,7 @@ namespace Catch {
     namespace {
         //! A do-nothing implementation of colour, used as fallback for unknown
         //! platforms, and when the user asks to deactivate all colours.
-        class NoColourImpl : public ColourImpl {
+        class NoColourImpl final : public ColourImpl {
         public:
             NoColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
@@ -103,7 +103,7 @@ namespace Catch {
 namespace Catch {
 namespace {
 
-    class Win32ColourImpl : public ColourImpl {
+    class Win32ColourImpl final : public ColourImpl {
     public:
         Win32ColourImpl(IStream* stream):
             ColourImpl(stream) {
@@ -169,7 +169,7 @@ namespace {
 namespace Catch {
 namespace {
 
-    class ANSIColourImpl : public ColourImpl {
+    class ANSIColourImpl final : public ColourImpl {
     public:
         ANSIColourImpl( IStream* stream ): ColourImpl( stream ) {}
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.cpp
index 17f28509..3b1cc277 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.cpp
@@ -11,49 +11,27 @@
 
 namespace Catch {
 
-    class Context : public IMutableContext, private Detail::NonCopyable {
+    Context* Context::currentContext = nullptr;
 
-    public: // IContext
-        IResultCapture* getResultCapture() override {
-            return m_resultCapture;
-        }
-
-        IConfig const* getConfig() const override {
-            return m_config;
-        }
-
-        ~Context() override;
-
-    public: // IMutableContext
-        void setResultCapture( IResultCapture* resultCapture ) override {
-            m_resultCapture = resultCapture;
-        }
-        void setConfig( IConfig const* config ) override {
-            m_config = config;
-        }
-
-        friend IMutableContext& getCurrentMutableContext();
-
-    private:
-        IConfig const* m_config = nullptr;
-        IResultCapture* m_resultCapture = nullptr;
-    };
-
-    IMutableContext *IMutableContext::currentContext = nullptr;
-
-    void IMutableContext::createContext()
-    {
+    void cleanUpContext() {
+        delete Context::currentContext;
+        Context::currentContext = nullptr;
+    }
+    void Context::createContext() {
         currentContext = new Context();
     }
 
-    void cleanUpContext() {
-        delete IMutableContext::currentContext;
-        IMutableContext::currentContext = nullptr;
+    Context& getCurrentMutableContext() {
+        if ( !Context::currentContext ) { Context::createContext(); }
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return *Context::currentContext;
+    }
+
+    void Context::setResultCapture( IResultCapture* resultCapture ) {
+        m_resultCapture = resultCapture;
     }
-    IContext::~IContext() = default;
-    IMutableContext::~IMutableContext() = default;
-    Context::~Context() = default;
 
+    void Context::setConfig( IConfig const* config ) { m_config = config; }
 
     SimplePcg32& sharedRng() {
         static SimplePcg32 s_rng;
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.hpp
index a9d1b394..6ccb3b31 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_context.hpp
@@ -15,38 +15,31 @@ namespace Catch {
     class IResultCapture;
     class IConfig;
 
-    class IContext {
-    public:
-        virtual ~IContext(); // = default
+    class Context {
+        IConfig const* m_config = nullptr;
+        IResultCapture* m_resultCapture = nullptr;
 
-        virtual IResultCapture* getResultCapture() = 0;
-        virtual IConfig const* getConfig() const = 0;
-    };
+        CATCH_EXPORT static Context* currentContext;
+        friend Context& getCurrentMutableContext();
+        friend Context const& getCurrentContext();
+        static void createContext();
+        friend void cleanUpContext();
 
-    class IMutableContext : public IContext {
     public:
-        ~IMutableContext() override; // = default
-        virtual void setResultCapture( IResultCapture* resultCapture ) = 0;
-        virtual void setConfig( IConfig const* config ) = 0;
-
-    private:
-        CATCH_EXPORT static IMutableContext* currentContext;
-        friend IMutableContext& getCurrentMutableContext();
-        friend void cleanUpContext();
-        static void createContext();
+        IResultCapture* getResultCapture() const { return m_resultCapture; }
+        IConfig const* getConfig() const { return m_config; }
+        void setResultCapture( IResultCapture* resultCapture );
+        void setConfig( IConfig const* config );
     };
 
-    inline IMutableContext& getCurrentMutableContext()
-    {
-        if( !IMutableContext::currentContext )
-            IMutableContext::createContext();
-        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
-        return *IMutableContext::currentContext;
-    }
+    Context& getCurrentMutableContext();
 
-    inline IContext& getCurrentContext()
-    {
-        return getCurrentMutableContext();
+    inline Context const& getCurrentContext() {
+        // We duplicate the logic from `getCurrentMutableContext` here,
+        // to avoid paying the call overhead in debug mode.
+        if ( !Context::currentContext ) { Context::createContext(); }
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return *Context::currentContext;
     }
 
     void cleanUpContext();
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_enum_values_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_enum_values_registry.cpp
index 7e8bf5e5..a94b6088 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_enum_values_registry.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_enum_values_registry.cpp
@@ -39,7 +39,7 @@ namespace Catch {
             return parsed;
         }
 
-        EnumInfo::~EnumInfo() {}
+        EnumInfo::~EnumInfo() = default;
 
         StringRef EnumInfo::lookup( int value ) const {
             for( auto const& valueToName : m_values ) {
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.cpp
index 0645c6ce..1eb61147 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.cpp
@@ -11,10 +11,27 @@
 #include <catch2/internal/catch_test_failure_exception.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
+#include <exception>
+
 namespace Catch {
 
-    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    namespace {
+        static std::string tryTranslators(
+            std::vector<
+                Detail::unique_ptr<IExceptionTranslator const>> const& translators ) {
+            if ( translators.empty() ) {
+                std::rethrow_exception( std::current_exception() );
+            } else {
+                return translators[0]->translate( translators.begin() + 1,
+                                                  translators.end() );
+            }
+        }
+
     }
+#endif //!defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+
+    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() = default;
 
     void ExceptionTranslatorRegistry::registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator ) {
         m_translators.push_back( CATCH_MOVE( translator ) );
@@ -37,7 +54,7 @@ namespace Catch {
         // First we try user-registered translators. If none of them can
         // handle the exception, it will be rethrown handled by our defaults.
         try {
-            return tryTranslators();
+            return tryTranslators(m_translators);
         }
         // To avoid having to handle TFE explicitly everywhere, we just
         // rethrow it so that it goes back up the caller.
@@ -61,23 +78,10 @@ namespace Catch {
         }
     }
 
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        if (m_translators.empty()) {
-            std::rethrow_exception(std::current_exception());
-        } else {
-            return m_translators[0]->translate(m_translators.begin() + 1, m_translators.end());
-        }
-    }
-
 #else // ^^ Exceptions are enabled // Exceptions are disabled vv
     std::string ExceptionTranslatorRegistry::translateActiveException() const {
         CATCH_INTERNAL_ERROR("Attempted to translate active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
     }
-
-    std::string ExceptionTranslatorRegistry::tryTranslators() const {
-        CATCH_INTERNAL_ERROR("Attempted to use exception translators under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
-    }
 #endif
 
-
 }
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.hpp
index 2aafa684..3123e93d 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_exception_translator_registry.hpp
@@ -21,7 +21,6 @@ namespace Catch {
         ~ExceptionTranslatorRegistry() override;
         void registerTranslator( Detail::unique_ptr<IExceptionTranslator>&& translator );
         std::string translateActiveException() const override;
-        std::string tryTranslators() const;
 
     private:
         ExceptionTranslators m_translators;
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
index f9702b18..9ef5b217 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.cpp
@@ -26,6 +26,7 @@
 
 #include <catch2/internal/catch_fatal_condition_handler.hpp>
 
+#include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_context.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/interfaces/catch_interfaces_capture.hpp>
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
index ce07f9b6..81728b56 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_fatal_condition_handler.hpp
@@ -8,9 +8,6 @@
 #ifndef CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 #define CATCH_FATAL_CONDITION_HANDLER_HPP_INCLUDED
 
-#include <catch2/internal/catch_platform.hpp>
-#include <catch2/internal/catch_compiler_capabilities.hpp>
-
 #include <cassert>
 
 namespace Catch {
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.cpp
index e30ee434..9631ed6d 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.cpp
@@ -27,6 +27,17 @@ namespace Catch {
             return i;
         }
 
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        bool directCompare( float lhs, float rhs ) { return lhs == rhs; }
+        bool directCompare( double lhs, double rhs ) { return lhs == rhs; }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+
     } // end namespace Detail
 } // end namespace Catch
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.hpp
index ca883c61..b2143726 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_floating_point_helpers.hpp
@@ -22,6 +22,11 @@ namespace Catch {
         uint32_t convertToBits(float f);
         uint64_t convertToBits(double d);
 
+        // Used when we know we want == comparison of two doubles
+        // to centralize warning suppression
+        bool directCompare( float lhs, float rhs );
+        bool directCompare( double lhs, double rhs );
+
     } // end namespace Detail
 
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_istream.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_istream.cpp
index 489396ec..2867ce74 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_istream.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_istream.cpp
@@ -24,7 +24,7 @@ namespace Catch {
 namespace Detail {
     namespace {
         template<typename WriterF, std::size_t bufferSize=256>
-        class StreamBufImpl : public std::streambuf {
+        class StreamBufImpl final : public std::streambuf {
             char data[bufferSize];
             WriterF m_writer;
 
@@ -72,7 +72,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class FileStream : public IStream {
+        class FileStream final : public IStream {
             std::ofstream m_ofs;
         public:
             FileStream( std::string const& filename ) {
@@ -80,7 +80,6 @@ namespace Detail {
                 CATCH_ENFORCE( !m_ofs.fail(), "Unable to open file: '" << filename << '\'' );
                 m_ofs << std::unitbuf;
             }
-            ~FileStream() override = default;
         public: // IStream
             std::ostream& stream() override {
                 return m_ofs;
@@ -89,13 +88,12 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class CoutStream : public IStream {
+        class CoutStream final : public IStream {
             std::ostream m_os;
         public:
             // Store the streambuf from cout up-front because
             // cout may get redirected when running tests
             CoutStream() : m_os( Catch::cout().rdbuf() ) {}
-            ~CoutStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -109,7 +107,6 @@ namespace Detail {
             // Store the streambuf from cerr up-front because
             // cout may get redirected when running tests
             CerrStream(): m_os( Catch::cerr().rdbuf() ) {}
-            ~CerrStream() override = default;
 
         public: // IStream
             std::ostream& stream() override { return m_os; }
@@ -118,7 +115,7 @@ namespace Detail {
 
         ///////////////////////////////////////////////////////////////////////////
 
-        class DebugOutStream : public IStream {
+        class DebugOutStream final : public IStream {
             Detail::unique_ptr<StreamBufImpl<OutputDebugWriter>> m_streamBuf;
             std::ostream m_os;
         public:
@@ -127,8 +124,6 @@ namespace Detail {
                 m_os( m_streamBuf.get() )
             {}
 
-            ~DebugOutStream() override = default;
-
         public: // IStream
             std::ostream& stream() override { return m_os; }
         };
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.cpp
new file mode 100644
index 00000000..ff65a9d3
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.cpp
@@ -0,0 +1,148 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#include <catch2/internal/catch_enforce.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+
+namespace Catch {
+    void JsonUtils::indent( std::ostream& os, std::uint64_t level ) {
+        for ( std::uint64_t i = 0; i < level; ++i ) {
+            os << "  ";
+        }
+    }
+    void JsonUtils::appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level ) {
+        if ( should_comma ) { os << ','; }
+        should_comma = true;
+        os << '\n';
+        indent( os, level );
+    }
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os ):
+        JsonObjectWriter{ os, 0 } {}
+
+    JsonObjectWriter::JsonObjectWriter( std::ostream& os,
+                                        std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '{';
+    }
+    JsonObjectWriter::JsonObjectWriter( JsonObjectWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+
+    JsonObjectWriter::~JsonObjectWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << '}';
+    }
+
+    JsonValueWriter JsonObjectWriter::write( StringRef key ) {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+
+        m_os << '"' << key << "\": ";
+        return JsonValueWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os ):
+        JsonArrayWriter{ os, 0 } {}
+    JsonArrayWriter::JsonArrayWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {
+        m_os << '[';
+    }
+    JsonArrayWriter::JsonArrayWriter( JsonArrayWriter&& source ):
+        m_os{ source.m_os },
+        m_indent_level{ source.m_indent_level },
+        m_should_comma{ source.m_should_comma },
+        m_active{ source.m_active } {
+        source.m_active = false;
+    }
+    JsonArrayWriter::~JsonArrayWriter() {
+        if ( !m_active ) { return; }
+
+        m_os << '\n';
+        JsonUtils::indent( m_os, m_indent_level );
+        m_os << ']';
+    }
+
+    JsonObjectWriter JsonArrayWriter::writeObject() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonObjectWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter JsonArrayWriter::writeArray() {
+        JsonUtils::appendCommaNewline(
+            m_os, m_should_comma, m_indent_level + 1 );
+        return JsonArrayWriter{ m_os, m_indent_level + 1 };
+    }
+
+    JsonArrayWriter& JsonArrayWriter::write( bool value ) {
+        return writeImpl( value );
+    }
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os ):
+        JsonValueWriter{ os, 0 } {}
+
+    JsonValueWriter::JsonValueWriter( std::ostream& os,
+                                      std::uint64_t indent_level ):
+        m_os{ os }, m_indent_level{ indent_level } {}
+
+    JsonObjectWriter JsonValueWriter::writeObject() && {
+        return JsonObjectWriter{ m_os, m_indent_level };
+    }
+
+    JsonArrayWriter JsonValueWriter::writeArray() && {
+        return JsonArrayWriter{ m_os, m_indent_level };
+    }
+
+    void JsonValueWriter::write( Catch::StringRef value ) && {
+        writeImpl( value, true );
+    }
+
+    void JsonValueWriter::write( bool value ) && {
+        writeImpl( value ? "true"_sr : "false"_sr, false );
+    }
+
+    void JsonValueWriter::writeImpl( Catch::StringRef value, bool quote ) {
+        if ( quote ) { m_os << '"'; }
+        for (char c : value) {
+            // Escape list taken from https://www.json.org/json-en.html,
+            // string definition.
+            // Note that while forward slash _can_ be escaped, it does
+            // not have to be, if JSON is not further embedded somewhere
+            // where forward slash is meaningful.
+            if ( c == '"' ) {
+                m_os << "\\\"";
+            } else if ( c == '\\' ) {
+                m_os << "\\\\";
+            } else if ( c == '\b' ) {
+                m_os << "\\b";
+            } else if ( c == '\f' ) {
+                m_os << "\\f";
+            } else if ( c == '\n' ) {
+                m_os << "\\n";
+            } else if ( c == '\r' ) {
+                m_os << "\\r";
+            } else if ( c == '\t' ) {
+                m_os << "\\t";
+            } else {
+                m_os << c;
+            }
+        }
+        if ( quote ) { m_os << '"'; }
+    }
+
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.hpp
new file mode 100644
index 00000000..59c044e4
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_jsonwriter.hpp
@@ -0,0 +1,120 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_JSONWRITER_HPP_INCLUDED
+#define CATCH_JSONWRITER_HPP_INCLUDED
+
+#include <catch2/internal/catch_reusable_string_stream.hpp>
+#include <catch2/internal/catch_stringref.hpp>
+
+#include <cstdint>
+#include <sstream>
+
+namespace Catch {
+    class JsonObjectWriter;
+    class JsonArrayWriter;
+
+    struct JsonUtils {
+        static void indent( std::ostream& os, std::uint64_t level );
+        static void appendCommaNewline( std::ostream& os,
+                                        bool& should_comma,
+                                        std::uint64_t level );
+    };
+
+    class JsonValueWriter {
+    public:
+        JsonValueWriter( std::ostream& os );
+        JsonValueWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter writeObject() &&;
+        JsonArrayWriter writeArray() &&;
+
+        template <typename T>
+        void write( T const& value ) && {
+            writeImpl( value, !std::is_arithmetic<T>::value );
+        }
+        void write( StringRef value ) &&;
+        void write( bool value ) &&;
+
+    private:
+        void writeImpl( StringRef value, bool quote );
+
+        // Without this SFINAE, this overload is a better match
+        // for `std::string`, `char const*`, `char const[N]` args.
+        // While it would still work, it would cause code bloat
+        // and multiple iteration over the strings
+        template <typename T,
+                  typename = typename std::enable_if_t<
+                      !std::is_convertible<T, StringRef>::value>>
+        void writeImpl( T const& value, bool quote_value ) {
+            m_sstream << value;
+            writeImpl( m_sstream.str(), quote_value );
+        }
+
+        std::ostream& m_os;
+        std::stringstream m_sstream;
+        std::uint64_t m_indent_level;
+    };
+
+    class JsonObjectWriter {
+    public:
+        JsonObjectWriter( std::ostream& os );
+        JsonObjectWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonObjectWriter( JsonObjectWriter&& source );
+        JsonObjectWriter& operator=( JsonObjectWriter&& source ) = delete;
+
+        ~JsonObjectWriter();
+
+        JsonValueWriter write( StringRef key );
+
+    private:
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+    class JsonArrayWriter {
+    public:
+        JsonArrayWriter( std::ostream& os );
+        JsonArrayWriter( std::ostream& os, std::uint64_t indent_level );
+
+        JsonArrayWriter( JsonArrayWriter&& source );
+        JsonArrayWriter& operator=( JsonArrayWriter&& source ) = delete;
+
+        ~JsonArrayWriter();
+
+        JsonObjectWriter writeObject();
+        JsonArrayWriter writeArray();
+
+        template <typename T>
+        JsonArrayWriter& write( T const& value ) {
+            return writeImpl( value );
+        }
+
+        JsonArrayWriter& write( bool value );
+
+    private:
+        template <typename T>
+        JsonArrayWriter& writeImpl( T const& value ) {
+            JsonUtils::appendCommaNewline(
+                m_os, m_should_comma, m_indent_level + 1 );
+            JsonValueWriter{ m_os }.write( value );
+
+            return *this;
+        }
+
+        std::ostream& m_os;
+        std::uint64_t m_indent_level;
+        bool m_should_comma = false;
+        bool m_active = true;
+    };
+
+} // namespace Catch
+
+#endif // CATCH_JSONWRITER_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_leak_detector.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_leak_detector.cpp
index 7389eaf7..691bc772 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_leak_detector.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_leak_detector.cpp
@@ -29,7 +29,7 @@ namespace Catch {
 
 #else // ^^ Windows crt debug heap enabled // Windows crt debug heap disabled vv
 
-    Catch::LeakDetector::LeakDetector() {}
+    Catch::LeakDetector::LeakDetector() = default;
 
 #endif // CATCH_CONFIG_WINDOWS_CRTDBG
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_list.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_list.cpp
index 263781d6..5bd06a2a 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_list.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_list.cpp
@@ -9,15 +9,12 @@
 
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
-#include <catch2/interfaces/catch_interfaces_testcase.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
+#include <catch2/internal/catch_test_case_registry_impl.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_case_insensitive_comparisons.hpp>
-
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/catch_config.hpp>
-#include <catch2/catch_test_spec.hpp>
 #include <catch2/catch_test_case_info.hpp>
 
 namespace Catch {
@@ -54,7 +51,7 @@ namespace Catch {
         void listReporters(IEventListener& reporter) {
             std::vector<ReporterDescription> descriptions;
 
-            IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+            auto const& factories = getRegistryHub().getReporterRegistry().getFactories();
             descriptions.reserve(factories.size());
             for (auto const& fac : factories) {
                 descriptions.push_back({ fac.first, fac.second->getDescription() });
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_message_info.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_message_info.hpp
index d2658429..1ef43fda 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_message_info.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_message_info.hpp
@@ -10,7 +10,7 @@
 
 #include <catch2/internal/catch_result_type.hpp>
 #include <catch2/internal/catch_source_line_info.hpp>
-#include <catch2/interfaces/catch_interfaces_capture.hpp>
+#include <catch2/internal/catch_stringref.hpp>
 
 #include <string>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_optional.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_optional.hpp
index ac3714ee..d1e953ad 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_optional.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_optional.hpp
@@ -8,6 +8,8 @@
 #ifndef CATCH_OPTIONAL_HPP_INCLUDED
 #define CATCH_OPTIONAL_HPP_INCLUDED
 
+#include <catch2/internal/catch_move_and_forward.hpp>
+
 #include <cassert>
 
 namespace Catch {
@@ -16,35 +18,50 @@ namespace Catch {
     template<typename T>
     class Optional {
     public:
-        Optional() : nullableValue( nullptr ) {}
-        Optional( T const& _value )
-        : nullableValue( new( storage ) T( _value ) )
-        {}
-        Optional( Optional const& _other )
-        : nullableValue( _other ? new( storage ) T( *_other ) : nullptr )
-        {}
-
-        ~Optional() {
+        Optional(): nullableValue( nullptr ) {}
+        ~Optional() { reset(); }
+
+        Optional( T const& _value ):
+            nullableValue( new ( storage ) T( _value ) ) {}
+        Optional( T&& _value ):
+            nullableValue( new ( storage ) T( CATCH_MOVE( _value ) ) ) {}
+
+        Optional& operator=( T const& _value ) {
             reset();
+            nullableValue = new ( storage ) T( _value );
+            return *this;
         }
+        Optional& operator=( T&& _value ) {
+            reset();
+            nullableValue = new ( storage ) T( CATCH_MOVE( _value ) );
+            return *this;
+        }
+
+        Optional( Optional const& _other ):
+            nullableValue( _other ? new ( storage ) T( *_other ) : nullptr ) {}
+        Optional( Optional&& _other ):
+            nullableValue( _other ? new ( storage ) T( CATCH_MOVE( *_other ) )
+                                  : nullptr ) {}
 
-        Optional& operator= ( Optional const& _other ) {
-            if( &_other != this ) {
+        Optional& operator=( Optional const& _other ) {
+            if ( &_other != this ) {
                 reset();
-                if( _other )
-                    nullableValue = new( storage ) T( *_other );
+                if ( _other ) { nullableValue = new ( storage ) T( *_other ); }
             }
             return *this;
         }
-        Optional& operator = ( T const& _value ) {
-            reset();
-            nullableValue = new( storage ) T( _value );
+        Optional& operator=( Optional&& _other ) {
+            if ( &_other != this ) {
+                reset();
+                if ( _other ) {
+                    nullableValue = new ( storage ) T( CATCH_MOVE( *_other ) );
+                }
+            }
             return *this;
         }
 
         void reset() {
-            if( nullableValue )
-                nullableValue->~T();
+            if ( nullableValue ) { nullableValue->~T(); }
             nullableValue = nullptr;
         }
 
@@ -91,7 +108,7 @@ namespace Catch {
         }
 
     private:
-        T *nullableValue;
+        T* nullableValue;
         alignas(alignof(T)) char storage[sizeof(T)];
     };
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.cpp
index 96efad5d..776c2243 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.cpp
@@ -31,4 +31,12 @@ namespace Catch {
     }
 #endif
 
+#if !defined( CATCH_CONFIG_GLOBAL_NEXTAFTER )
+    float nextafter( float x, float y ) { return std::nextafter( x, y ); }
+    double nextafter( double x, double y ) { return std::nextafter( x, y ); }
+#else
+    float nextafter( float x, float y ) { return ::nextafterf( x, y ); }
+    double nextafter( double x, double y ) { return ::nextafter( x, y ); }
+#endif
+
 } // end namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.hpp
index 23a9332b..4503f8f2 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_polyfills.hpp
@@ -9,8 +9,13 @@
 #define CATCH_POLYFILLS_HPP_INCLUDED
 
 namespace Catch {
+
     bool isnan(float f);
     bool isnan(double d);
+
+    float nextafter(float x, float y);
+    double nextafter(double x, double y);
+
 }
 
 #endif // CATCH_POLYFILLS_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_preprocessor_internal_stringify.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_preprocessor_internal_stringify.hpp
new file mode 100644
index 00000000..2fd64e1c
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_preprocessor_internal_stringify.hpp
@@ -0,0 +1,19 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+#define CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
+
+#include <catch2/catch_user_config.hpp>
+
+#if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)
+  #define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__##_catch_sr
+#else
+  #define CATCH_INTERNAL_STRINGIFY(...) "Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION"_catch_sr
+#endif
+
+#endif // CATCH_PREPROCESSOR_INTERNAL_STRINGIFY_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp
new file mode 100644
index 00000000..c59c0539
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp
@@ -0,0 +1,94 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
+
+#include <catch2/internal/catch_polyfills.hpp>
+
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+        /**
+         * Returns the largest magnitude of 1-ULP distance inside the [a, b] range.
+         *
+         * Assumes `a < b`.
+         */
+        template <typename FloatType>
+        FloatType gamma(FloatType a, FloatType b) {
+            static_assert( std::is_floating_point<FloatType>::value,
+                           "gamma returns the largest ULP magnitude within "
+                           "floating point range [a, b]. This only makes sense "
+                           "for floating point types" );
+            assert( a <= b );
+
+            const auto gamma_up = Catch::nextafter( a, std::numeric_limits<FloatType>::infinity() ) - a;
+            const auto gamma_down = b - Catch::nextafter( b, -std::numeric_limits<FloatType>::infinity() );
+
+            return gamma_up < gamma_down ? gamma_down : gamma_up;
+        }
+
+        template <typename FloatingPoint>
+        struct DistanceTypePicker;
+        template <>
+        struct DistanceTypePicker<float> {
+            using type = std::uint32_t;
+        };
+        template <>
+        struct DistanceTypePicker<double> {
+            using type = std::uint64_t;
+        };
+
+        template <typename T>
+        using DistanceType = typename DistanceTypePicker<T>::type;
+
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        /**
+         * Computes the number of equi-distant floats in [a, b]
+         *
+         * Since not every range can be split into equidistant floats
+         * exactly, we actually compute ceil(b/distance - a/distance),
+         * because in those cases we want to overcount.
+         *
+         * Uses modified Dekker's FastTwoSum algorithm to handle rounding.
+         */
+        template <typename FloatType>
+        DistanceType<FloatType>
+        count_equidistant_floats( FloatType a, FloatType b, FloatType distance ) {
+            assert( a <= b );
+            // We get distance as gamma for our uniform float distribution,
+            // so this will round perfectly.
+            const auto ag = a / distance;
+            const auto bg = b / distance;
+
+            const auto s = bg - ag;
+            const auto err = ( std::fabs( a ) <= std::fabs( b ) )
+                                 ? -ag - ( s - bg )
+                                 : bg - ( s + ag );
+            const auto ceil_s = static_cast<DistanceType<FloatType>>( std::ceil( s ) );
+
+            return ( ceil_s != s ) ? ceil_s : ceil_s + ( err > 0 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+
+    }
+
+} // end namespace Catch
+
+#endif // CATCH_RANDOM_FLOATING_POINT_HELPERS_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_integer_helpers.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_integer_helpers.hpp
new file mode 100644
index 00000000..1c450f05
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_integer_helpers.hpp
@@ -0,0 +1,202 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+#define CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace Catch {
+    namespace Detail {
+
+        template <std::size_t>
+        struct SizedUnsignedType;
+#define SizedUnsignedTypeHelper( TYPE )        \
+    template <>                                \
+    struct SizedUnsignedType<sizeof( TYPE )> { \
+        using type = TYPE;                     \
+    }
+
+        SizedUnsignedTypeHelper( std::uint8_t );
+        SizedUnsignedTypeHelper( std::uint16_t );
+        SizedUnsignedTypeHelper( std::uint32_t );
+        SizedUnsignedTypeHelper( std::uint64_t );
+#undef SizedUnsignedTypeHelper
+
+        template <std::size_t sz>
+        using SizedUnsignedType_t = typename SizedUnsignedType<sz>::type;
+
+        template <typename T>
+        using DoubleWidthUnsignedType_t = SizedUnsignedType_t<2 * sizeof( T )>;
+
+        template <typename T>
+        struct ExtendedMultResult {
+            T upper;
+            T lower;
+            friend bool operator==( ExtendedMultResult const& lhs,
+                                    ExtendedMultResult const& rhs ) {
+                return lhs.upper == rhs.upper && lhs.lower == rhs.lower;
+            }
+        };
+
+        // Returns 128 bit result of multiplying lhs and rhs
+        constexpr ExtendedMultResult<std::uint64_t>
+        extendedMult( std::uint64_t lhs, std::uint64_t rhs ) {
+            // We use the simple long multiplication approach for
+            // correctness, we can use platform specific builtins
+            // for performance later.
+
+            // Split the lhs and rhs into two 32bit "digits", so that we can
+            // do 64 bit arithmetic to handle carry bits.
+            //            32b    32b    32b    32b
+            //     lhs                  L1     L2
+            //   * rhs                  R1     R2
+            //            ------------------------
+            //                       |  R2 * L2  |
+            //                 |  R2 * L1  |
+            //                 |  R1 * L2  |
+            //           |  R1 * L1  |
+            //           -------------------------
+            //           |  a  |  b  |  c  |  d  |
+
+#define CarryBits( x ) ( x >> 32 )
+#define Digits( x ) ( x & 0xFF'FF'FF'FF )
+
+            auto r2l2 = Digits( rhs ) * Digits( lhs );
+            auto r2l1 = Digits( rhs ) * CarryBits( lhs );
+            auto r1l2 = CarryBits( rhs ) * Digits( lhs );
+            auto r1l1 = CarryBits( rhs ) * CarryBits( lhs );
+
+            // Sum to columns first
+            auto d = Digits( r2l2 );
+            auto c = CarryBits( r2l2 ) + Digits( r2l1 ) + Digits( r1l2 );
+            auto b = CarryBits( r2l1 ) + CarryBits( r1l2 ) + Digits( r1l1 );
+            auto a = CarryBits( r1l1 );
+
+            // Propagate carries between columns
+            c += CarryBits( d );
+            b += CarryBits( c );
+            a += CarryBits( b );
+
+            // Remove the used carries
+            c = Digits( c );
+            b = Digits( b );
+            a = Digits( a );
+
+#undef CarryBits
+#undef Digits
+
+            return {
+                a << 32 | b, // upper 64 bits
+                c << 32 | d  // lower 64 bits
+            };
+        }
+
+        template <typename UInt>
+        constexpr ExtendedMultResult<UInt> extendedMult( UInt lhs, UInt rhs ) {
+            static_assert( std::is_unsigned<UInt>::value,
+                           "extendedMult can only handle unsigned integers" );
+            static_assert( sizeof( UInt ) < sizeof( std::uint64_t ),
+                           "Generic extendedMult can only handle types smaller "
+                           "than uint64_t" );
+            using WideType = DoubleWidthUnsignedType_t<UInt>;
+
+            auto result = WideType( lhs ) * WideType( rhs );
+            return {
+                static_cast<UInt>( result >> ( CHAR_BIT * sizeof( UInt ) ) ),
+                static_cast<UInt>( result & UInt( -1 ) ) };
+        }
+
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) >= sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value, "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            // We want to return the top bits from a generator, as they are
+            // usually considered higher quality.
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+
+            return static_cast<TargetType>( gen() >>
+                                            ( generated_bits - return_bits) );
+        }
+
+        template <typename TargetType,
+                  typename Generator>
+            std::enable_if_t<sizeof(typename Generator::result_type) < sizeof(TargetType),
+            TargetType> fillBitsFrom(Generator& gen) {
+            using gresult_type = typename Generator::result_type;
+            static_assert( std::is_unsigned<TargetType>::value,
+                           "Only unsigned integers are supported" );
+            static_assert( Generator::min() == 0 &&
+                           Generator::max() == static_cast<gresult_type>( -1 ),
+                           "Generator must be able to output all numbers in its result type (effectively it must be a random bit generator)" );
+
+            constexpr auto generated_bits = sizeof( gresult_type ) * CHAR_BIT;
+            constexpr auto return_bits = sizeof( TargetType ) * CHAR_BIT;
+            std::size_t filled_bits = 0;
+            TargetType ret = 0;
+            do {
+                ret <<= generated_bits;
+                ret |= gen();
+                filled_bits += generated_bits;
+            } while ( filled_bits < return_bits );
+
+            return ret;
+        }
+
+        /*
+         * Transposes numbers into unsigned type while keeping their ordering
+         *
+         * This means that signed types are changed so that the ordering is
+         * [INT_MIN, ..., -1, 0, ..., INT_MAX], rather than order we would
+         * get by simple casting ([0, ..., INT_MAX, INT_MIN, ..., -1])
+         */
+        template <typename OriginalType, typename UnsignedType>
+        std::enable_if_t<std::is_signed<OriginalType>::value, UnsignedType>
+        transposeToNaturalOrder( UnsignedType in ) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value,
+                           "Input type must be unsigned" );
+            // Assuming 2s complement (standardized in current C++), the
+            // positive and negative numbers are already internally ordered,
+            // and their difference is in the top bit. Swapping it orders
+            // them the desired way.
+            constexpr auto highest_bit =
+                UnsignedType( 1 ) << ( sizeof( UnsignedType ) * CHAR_BIT - 1 );
+            return static_cast<UnsignedType>( in ^ highest_bit );
+        }
+
+
+
+        template <typename OriginalType,
+                  typename UnsignedType>
+        std::enable_if_t<std::is_unsigned<OriginalType>::value, UnsignedType>
+            transposeToNaturalOrder(UnsignedType in) {
+            static_assert(
+                sizeof( OriginalType ) == sizeof( UnsignedType ),
+                "reordering requires the same sized types on both sides" );
+            static_assert( std::is_unsigned<UnsignedType>::value, "Input type must be unsigned" );
+            // No reordering is needed for unsigned -> unsigned
+            return in;
+        }
+    } // namespace Detail
+} // namespace Catch
+
+#endif // CATCH_RANDOM_INTEGER_HELPERS_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_seed_generation.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_seed_generation.cpp
index 40c468cb..fdc3fa19 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_seed_generation.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_random_seed_generation.cpp
@@ -9,6 +9,7 @@
 #include <catch2/internal/catch_random_seed_generation.hpp>
 
 #include <catch2/internal/catch_enforce.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 
 #include <ctime>
 #include <random>
@@ -21,10 +22,10 @@ namespace Catch {
             return static_cast<std::uint32_t>( std::time( nullptr ) );
 
         case GenerateFrom::Default:
-        case GenerateFrom::RandomDevice:
-            // In theory, a platform could have random_device that returns just
-            // 16 bits. That is still some randomness, so we don't care too much
-            return static_cast<std::uint32_t>( std::random_device{}() );
+        case GenerateFrom::RandomDevice: {
+            std::random_device rd;
+            return Detail::fillBitsFrom<std::uint32_t>( rd );
+        }
 
         default:
             CATCH_ERROR("Unknown generation method");
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.cpp
index 4c0c44f4..cea8c4dc 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.cpp
@@ -5,61 +5,87 @@
 //        https://www.boost.org/LICENSE_1_0.txt)
 
 // SPDX-License-Identifier: BSL-1.0
-#include <catch2/internal/catch_reporter_registry.hpp>
 
-#include <catch2/reporters/catch_reporter_registrars.hpp>
+#include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
+#include <catch2/internal/catch_enforce.hpp>
+#include <catch2/internal/catch_move_and_forward.hpp>
+#include <catch2/internal/catch_reporter_registry.hpp>
 #include <catch2/reporters/catch_reporter_automake.hpp>
 #include <catch2/reporters/catch_reporter_compact.hpp>
 #include <catch2/reporters/catch_reporter_console.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
 #include <catch2/reporters/catch_reporter_junit.hpp>
+#include <catch2/reporters/catch_reporter_registrars.hpp>
 #include <catch2/reporters/catch_reporter_sonarqube.hpp>
 #include <catch2/reporters/catch_reporter_tap.hpp>
 #include <catch2/reporters/catch_reporter_teamcity.hpp>
 #include <catch2/reporters/catch_reporter_xml.hpp>
-#include <catch2/internal/catch_move_and_forward.hpp>
-#include <catch2/internal/catch_enforce.hpp>
 
 namespace Catch {
+    struct ReporterRegistry::ReporterRegistryImpl {
+        std::vector<Detail::unique_ptr<EventListenerFactory>> listeners;
+        std::map<std::string, IReporterFactoryPtr, Detail::CaseInsensitiveLess>
+            factories;
+    };
 
-    ReporterRegistry::ReporterRegistry() {
+    ReporterRegistry::ReporterRegistry():
+        m_impl( Detail::make_unique<ReporterRegistryImpl>() ) {
         // Because it is impossible to move out of initializer list,
         // we have to add the elements manually
-        m_factories["Automake"] = Detail::make_unique<ReporterFactory<AutomakeReporter>>();
-        m_factories["compact"] = Detail::make_unique<ReporterFactory<CompactReporter>>();
-        m_factories["console"] = Detail::make_unique<ReporterFactory<ConsoleReporter>>();
-        m_factories["JUnit"] = Detail::make_unique<ReporterFactory<JunitReporter>>();
-        m_factories["SonarQube"] = Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
-        m_factories["TAP"] = Detail::make_unique<ReporterFactory<TAPReporter>>();
-        m_factories["TeamCity"] = Detail::make_unique<ReporterFactory<TeamCityReporter>>();
-        m_factories["XML"] = Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["Automake"] =
+            Detail::make_unique<ReporterFactory<AutomakeReporter>>();
+        m_impl->factories["compact"] =
+            Detail::make_unique<ReporterFactory<CompactReporter>>();
+        m_impl->factories["console"] =
+            Detail::make_unique<ReporterFactory<ConsoleReporter>>();
+        m_impl->factories["JUnit"] =
+            Detail::make_unique<ReporterFactory<JunitReporter>>();
+        m_impl->factories["SonarQube"] =
+            Detail::make_unique<ReporterFactory<SonarQubeReporter>>();
+        m_impl->factories["TAP"] =
+            Detail::make_unique<ReporterFactory<TAPReporter>>();
+        m_impl->factories["TeamCity"] =
+            Detail::make_unique<ReporterFactory<TeamCityReporter>>();
+        m_impl->factories["XML"] =
+            Detail::make_unique<ReporterFactory<XmlReporter>>();
+        m_impl->factories["JSON"] =
+            Detail::make_unique<ReporterFactory<JsonReporter>>();
     }
 
     ReporterRegistry::~ReporterRegistry() = default;
 
-
-    IEventListenerPtr ReporterRegistry::create( std::string const& name, ReporterConfig&& config ) const {
-        auto it =  m_factories.find( name );
-        if( it == m_factories.end() )
-            return nullptr;
-        return it->second->create( CATCH_MOVE(config) );
+    IEventListenerPtr
+    ReporterRegistry::create( std::string const& name,
+                              ReporterConfig&& config ) const {
+        auto it = m_impl->factories.find( name );
+        if ( it == m_impl->factories.end() ) return nullptr;
+        return it->second->create( CATCH_MOVE( config ) );
     }
 
-    void ReporterRegistry::registerReporter( std::string const& name, IReporterFactoryPtr factory ) {
+    void ReporterRegistry::registerReporter( std::string const& name,
+                                             IReporterFactoryPtr factory ) {
         CATCH_ENFORCE( name.find( "::" ) == name.npos,
-                       "'::' is not allowed in reporter name: '" + name + '\'' );
-        auto ret = m_factories.emplace(name, CATCH_MOVE(factory));
-        CATCH_ENFORCE( ret.second, "reporter using '" + name + "' as name was already registered" );
+                       "'::' is not allowed in reporter name: '" + name +
+                           '\'' );
+        auto ret = m_impl->factories.emplace( name, CATCH_MOVE( factory ) );
+        CATCH_ENFORCE( ret.second,
+                       "reporter using '" + name +
+                           "' as name was already registered" );
     }
     void ReporterRegistry::registerListener(
         Detail::unique_ptr<EventListenerFactory> factory ) {
-        m_listeners.push_back( CATCH_MOVE(factory) );
+        m_impl->listeners.push_back( CATCH_MOVE( factory ) );
     }
 
-    IReporterRegistry::FactoryMap const& ReporterRegistry::getFactories() const {
-        return m_factories;
-    }
-    IReporterRegistry::Listeners const& ReporterRegistry::getListeners() const {
-        return m_listeners;
+    std::map<std::string,
+             IReporterFactoryPtr,
+             Detail::CaseInsensitiveLess> const&
+    ReporterRegistry::getFactories() const {
+        return m_impl->factories;
     }
 
-}
+    std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+    ReporterRegistry::getListeners() const {
+        return m_impl->listeners;
+    }
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.hpp
index 5577b9ef..92a88927 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_registry.hpp
@@ -8,31 +8,48 @@
 #ifndef CATCH_REPORTER_REGISTRY_HPP_INCLUDED
 #define CATCH_REPORTER_REGISTRY_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
+#include <catch2/internal/catch_case_insensitive_comparisons.hpp>
+#include <catch2/internal/catch_unique_ptr.hpp>
 
 #include <map>
+#include <string>
+#include <vector>
 
 namespace Catch {
 
-    class ReporterRegistry : public IReporterRegistry {
-    public:
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
+    class IReporterFactory;
+    using IReporterFactoryPtr = Detail::unique_ptr<IReporterFactory>;
+    struct ReporterConfig;
+    class EventListenerFactory;
+
+    class ReporterRegistry {
+        struct ReporterRegistryImpl;
+        Detail::unique_ptr<ReporterRegistryImpl> m_impl;
 
+    public:
         ReporterRegistry();
-        ~ReporterRegistry() override; // = default, out of line to allow fwd decl
+        ~ReporterRegistry(); // = default;
 
-        IEventListenerPtr create( std::string const& name, ReporterConfig&& config ) const override;
+        IEventListenerPtr create( std::string const& name,
+                                  ReporterConfig&& config ) const;
 
-        void registerReporter( std::string const& name, IReporterFactoryPtr factory );
-        void registerListener( Detail::unique_ptr<EventListenerFactory> factory );
+        void registerReporter( std::string const& name,
+                               IReporterFactoryPtr factory );
 
-        FactoryMap const& getFactories() const override;
-        Listeners const& getListeners() const override;
+        void
+        registerListener( Detail::unique_ptr<EventListenerFactory> factory );
 
-    private:
-        FactoryMap m_factories;
-        Listeners m_listeners;
+        std::map<std::string,
+                 IReporterFactoryPtr,
+                 Detail::CaseInsensitiveLess> const&
+        getFactories() const;
+
+        std::vector<Detail::unique_ptr<EventListenerFactory>> const&
+        getListeners() const;
     };
-}
+
+} // end namespace Catch
 
 #endif // CATCH_REPORTER_REGISTRY_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.cpp
index f6591d9a..8b88b170 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.cpp
@@ -21,9 +21,9 @@ namespace Catch {
         };
 
         kvPair splitKVPair(StringRef kvString) {
-            auto splitPos = static_cast<size_t>( std::distance(
-                kvString.begin(),
-                std::find( kvString.begin(), kvString.end(), '=' ) ) );
+            auto splitPos = static_cast<size_t>(
+                std::find( kvString.begin(), kvString.end(), '=' ) -
+                kvString.begin() );
 
             return { kvString.substr( 0, splitPos ),
                      kvString.substr( splitPos + 1, kvString.size() ) };
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
index d446ce98..9f447ee2 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_reporter_spec_parser.hpp
@@ -8,7 +8,7 @@
 #ifndef CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 #define CATCH_REPORTER_SPEC_PARSER_HPP_INCLUDED
 
-#include <catch2/internal/catch_console_colour.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/internal/catch_optional.hpp>
 #include <catch2/internal/catch_stringref.hpp>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.cpp
index 08086b28..77b476d8 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.cpp
@@ -8,8 +8,9 @@
 #include <catch2/internal/catch_run_context.hpp>
 
 #include <catch2/catch_user_config.hpp>
-#include <catch2/interfaces/catch_interfaces_generatortracker.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/interfaces/catch_interfaces_generatortracker.hpp>
+#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_context.hpp>
 #include <catch2/internal/catch_enforce.hpp>
@@ -19,6 +20,7 @@
 #include <catch2/internal/catch_output_redirect.hpp>
 #include <catch2/internal/catch_assertion_handler.hpp>
 #include <catch2/internal/catch_test_failure_exception.hpp>
+#include <catch2/internal/catch_result_type.hpp>
 
 #include <cassert>
 #include <algorithm>
@@ -26,146 +28,151 @@
 namespace Catch {
 
     namespace Generators {
-        struct GeneratorTracker : TestCaseTracking::TrackerBase, IGeneratorTracker {
-            GeneratorBasePtr m_generator;
-
-            GeneratorTracker( TestCaseTracking::NameAndLocation&& nameAndLocation, TrackerContext& ctx, ITracker* parent )
-            :   TrackerBase( CATCH_MOVE(nameAndLocation), ctx, parent )
-            {}
-            ~GeneratorTracker() override;
-
-            static GeneratorTracker* acquire( TrackerContext& ctx, TestCaseTracking::NameAndLocationRef const& nameAndLocation ) {
-                GeneratorTracker* tracker;
-
-                ITracker& currentTracker = ctx.currentTracker();
-                // Under specific circumstances, the generator we want
-                // to acquire is also the current tracker. If this is
-                // the case, we have to avoid looking through current
-                // tracker's children, and instead return the current
-                // tracker.
-                // A case where this check is important is e.g.
-                //     for (int i = 0; i < 5; ++i) {
-                //         int n = GENERATE(1, 2);
-                //     }
-                //
-                // without it, the code above creates 5 nested generators.
-                if ( currentTracker.nameAndLocation() == nameAndLocation ) {
-                    auto thisTracker =
-                        currentTracker.parent()->findChild( nameAndLocation );
-                    assert( thisTracker );
-                    assert( thisTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( thisTracker );
-                } else if ( ITracker* childTracker =
-                                currentTracker.findChild( nameAndLocation ) ) {
-                    assert( childTracker );
-                    assert( childTracker->isGeneratorTracker() );
-                    tracker = static_cast<GeneratorTracker*>( childTracker );
-                } else {
-                    return nullptr;
-                }
+        namespace {
+            struct GeneratorTracker final : TestCaseTracking::TrackerBase,
+                                      IGeneratorTracker {
+                GeneratorBasePtr m_generator;
+
+                GeneratorTracker(
+                    TestCaseTracking::NameAndLocation&& nameAndLocation,
+                    TrackerContext& ctx,
+                    ITracker* parent ):
+                    TrackerBase( CATCH_MOVE( nameAndLocation ), ctx, parent ) {}
+
+                static GeneratorTracker*
+                acquire( TrackerContext& ctx,
+                         TestCaseTracking::NameAndLocationRef const&
+                             nameAndLocation ) {
+                    GeneratorTracker* tracker;
+
+                    ITracker& currentTracker = ctx.currentTracker();
+                    // Under specific circumstances, the generator we want
+                    // to acquire is also the current tracker. If this is
+                    // the case, we have to avoid looking through current
+                    // tracker's children, and instead return the current
+                    // tracker.
+                    // A case where this check is important is e.g.
+                    //     for (int i = 0; i < 5; ++i) {
+                    //         int n = GENERATE(1, 2);
+                    //     }
+                    //
+                    // without it, the code above creates 5 nested generators.
+                    if ( currentTracker.nameAndLocation() == nameAndLocation ) {
+                        auto thisTracker = currentTracker.parent()->findChild(
+                            nameAndLocation );
+                        assert( thisTracker );
+                        assert( thisTracker->isGeneratorTracker() );
+                        tracker = static_cast<GeneratorTracker*>( thisTracker );
+                    } else if ( ITracker* childTracker =
+                                    currentTracker.findChild(
+                                        nameAndLocation ) ) {
+                        assert( childTracker );
+                        assert( childTracker->isGeneratorTracker() );
+                        tracker =
+                            static_cast<GeneratorTracker*>( childTracker );
+                    } else {
+                        return nullptr;
+                    }
 
-                if( !tracker->isComplete() ) {
-                    tracker->open();
-                }
+                    if ( !tracker->isComplete() ) { tracker->open(); }
 
-                return tracker;
-            }
-
-            // TrackerBase interface
-            bool isGeneratorTracker() const override { return true; }
-            auto hasGenerator() const -> bool override {
-                return !!m_generator;
-            }
-            void close() override {
-                TrackerBase::close();
-                // If a generator has a child (it is followed by a section)
-                // and none of its children have started, then we must wait
-                // until later to start consuming its values.
-                // This catches cases where `GENERATE` is placed between two
-                // `SECTION`s.
-                // **The check for m_children.empty cannot be removed**.
-                // doing so would break `GENERATE` _not_ followed by `SECTION`s.
-                const bool should_wait_for_child = [&]() {
-                    // No children -> nobody to wait for
-                    if ( m_children.empty() ) {
-                        return false;
-                    }
-                    // If at least one child started executing, don't wait
-                    if ( std::find_if(
-                             m_children.begin(),
-                             m_children.end(),
-                             []( TestCaseTracking::ITrackerPtr const& tracker ) {
-                                 return tracker->hasStarted();
-                             } ) != m_children.end() ) {
-                        return false;
-                    }
+                    return tracker;
+                }
 
-                    // No children have started. We need to check if they _can_
-                    // start, and thus we should wait for them, or they cannot
-                    // start (due to filters), and we shouldn't wait for them
-                    ITracker* parent = m_parent;
-                    // This is safe: there is always at least one section
-                    // tracker in a test case tracking tree
-                    while ( !parent->isSectionTracker() ) {
-                        parent = parent->parent();
-                    }
-                    assert( parent &&
-                            "Missing root (test case) level section" );
-
-                    auto const& parentSection =
-                        static_cast<SectionTracker const&>( *parent );
-                    auto const& filters = parentSection.getFilters();
-                    // No filters -> no restrictions on running sections
-                    if ( filters.empty() ) {
-                        return true;
-                    }
+                // TrackerBase interface
+                bool isGeneratorTracker() const override { return true; }
+                auto hasGenerator() const -> bool override {
+                    return !!m_generator;
+                }
+                void close() override {
+                    TrackerBase::close();
+                    // If a generator has a child (it is followed by a section)
+                    // and none of its children have started, then we must wait
+                    // until later to start consuming its values.
+                    // This catches cases where `GENERATE` is placed between two
+                    // `SECTION`s.
+                    // **The check for m_children.empty cannot be removed**.
+                    // doing so would break `GENERATE` _not_ followed by
+                    // `SECTION`s.
+                    const bool should_wait_for_child = [&]() {
+                        // No children -> nobody to wait for
+                        if ( m_children.empty() ) { return false; }
+                        // If at least one child started executing, don't wait
+                        if ( std::find_if(
+                                 m_children.begin(),
+                                 m_children.end(),
+                                 []( TestCaseTracking::ITrackerPtr const&
+                                         tracker ) {
+                                     return tracker->hasStarted();
+                                 } ) != m_children.end() ) {
+                            return false;
+                        }
 
-                    for ( auto const& child : m_children ) {
-                        if ( child->isSectionTracker() &&
-                             std::find(
-                                 filters.begin(),
-                                 filters.end(),
-                                 static_cast<SectionTracker const&>( *child )
-                                     .trimmedName() ) != filters.end() ) {
-                            return true;
+                        // No children have started. We need to check if they
+                        // _can_ start, and thus we should wait for them, or
+                        // they cannot start (due to filters), and we shouldn't
+                        // wait for them
+                        ITracker* parent = m_parent;
+                        // This is safe: there is always at least one section
+                        // tracker in a test case tracking tree
+                        while ( !parent->isSectionTracker() ) {
+                            parent = parent->parent();
+                        }
+                        assert( parent &&
+                                "Missing root (test case) level section" );
+
+                        auto const& parentSection =
+                            static_cast<SectionTracker const&>( *parent );
+                        auto const& filters = parentSection.getFilters();
+                        // No filters -> no restrictions on running sections
+                        if ( filters.empty() ) { return true; }
+
+                        for ( auto const& child : m_children ) {
+                            if ( child->isSectionTracker() &&
+                                 std::find( filters.begin(),
+                                            filters.end(),
+                                            static_cast<SectionTracker const&>(
+                                                *child )
+                                                .trimmedName() ) !=
+                                     filters.end() ) {
+                                return true;
+                            }
                         }
+                        return false;
+                    }();
+
+                    // This check is a bit tricky, because m_generator->next()
+                    // has a side-effect, where it consumes generator's current
+                    // value, but we do not want to invoke the side-effect if
+                    // this generator is still waiting for any child to start.
+                    assert( m_generator && "Tracker without generator" );
+                    if ( should_wait_for_child ||
+                         ( m_runState == CompletedSuccessfully &&
+                           m_generator->countedNext() ) ) {
+                        m_children.clear();
+                        m_runState = Executing;
                     }
-                    return false;
-                }();
-
-                // This check is a bit tricky, because m_generator->next()
-                // has a side-effect, where it consumes generator's current
-                // value, but we do not want to invoke the side-effect if
-                // this generator is still waiting for any child to start.
-                assert( m_generator && "Tracker without generator" );
-                if ( should_wait_for_child ||
-                     ( m_runState == CompletedSuccessfully &&
-                       m_generator->countedNext() ) ) {
-                    m_children.clear();
-                    m_runState = Executing;
                 }
-            }
 
-            // IGeneratorTracker interface
-            auto getGenerator() const -> GeneratorBasePtr const& override {
-                return m_generator;
-            }
-            void setGenerator( GeneratorBasePtr&& generator ) override {
-                m_generator = CATCH_MOVE( generator );
-            }
-        };
-        GeneratorTracker::~GeneratorTracker() = default;
+                // IGeneratorTracker interface
+                auto getGenerator() const -> GeneratorBasePtr const& override {
+                    return m_generator;
+                }
+                void setGenerator( GeneratorBasePtr&& generator ) override {
+                    m_generator = CATCH_MOVE( generator );
+                }
+            };
+        } // namespace
     }
 
     RunContext::RunContext(IConfig const* _config, IEventListenerPtr&& reporter)
     :   m_runInfo(_config->name()),
-        m_context(getCurrentMutableContext()),
         m_config(_config),
         m_reporter(CATCH_MOVE(reporter)),
         m_lastAssertionInfo{ StringRef(), SourceLineInfo("",0), StringRef(), ResultDisposition::Normal },
         m_includeSuccessfulResults( m_config->includeSuccessfulResults() || m_reporter->getPreferences().shouldReportAllAssertions )
     {
-        m_context.setResultCapture(this);
+        getCurrentMutableContext().setResultCapture( this );
         m_reporter->testRunStarting(m_runInfo);
     }
 
@@ -260,7 +267,7 @@ namespace Catch {
     }
 
 
-    void RunContext::assertionEnded(AssertionResult const & result) {
+    void RunContext::assertionEnded(AssertionResult&& result) {
         if (result.getResultType() == ResultWas::Ok) {
             m_totals.assertions.passed++;
             m_lastAssertionPassed = true;
@@ -282,19 +289,27 @@ namespace Catch {
 
         m_reporter->assertionEnded(AssertionStats(result, m_messages, m_totals));
 
-        if (result.getResultType() != ResultWas::Warning)
+        if ( result.getResultType() != ResultWas::Warning ) {
             m_messageScopes.clear();
+        }
 
-        // Reset working state
-        resetAssertionInfo();
-        m_lastResult = result;
+        // Reset working state. assertion info will be reset after
+        // populateReaction is run if it is needed
+        m_lastResult = CATCH_MOVE( result );
     }
     void RunContext::resetAssertionInfo() {
         m_lastAssertionInfo.macroName = StringRef();
         m_lastAssertionInfo.capturedExpression = "{Unknown expression after the reported line}"_sr;
+        m_lastAssertionInfo.resultDisposition = ResultDisposition::Normal;
     }
 
-    bool RunContext::sectionStarted(StringRef sectionName, SourceLineInfo const& sectionLineInfo, Counts & assertions) {
+    void RunContext::notifyAssertionStarted( AssertionInfo const& info ) {
+        m_reporter->assertionStarting( info );
+    }
+
+    bool RunContext::sectionStarted( StringRef sectionName,
+                                     SourceLineInfo const& sectionLineInfo,
+                                     Counts& assertions ) {
         ITracker& sectionTracker =
             SectionTracker::acquire( m_trackerContext,
                                      TestCaseTracking::NameAndLocationRef(
@@ -432,7 +447,8 @@ namespace Catch {
         tempResult.message = static_cast<std::string>(message);
         AssertionResult result(m_lastAssertionInfo, CATCH_MOVE(tempResult));
 
-        assertionEnded(result);
+        assertionEnded(CATCH_MOVE(result) );
+        resetAssertionInfo();
 
         handleUnfinishedSections();
 
@@ -554,8 +570,6 @@ namespace Catch {
         ITransientExpression const& expr,
         AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         bool negated = isFalseTest( info.resultDisposition );
         bool result = expr.getResult() != negated;
 
@@ -571,6 +585,7 @@ namespace Catch {
             reportExpr(info, ResultWas::ExpressionFailed, &expr, negated );
             populateReaction( reaction );
         }
+        resetAssertionInfo();
     }
     void RunContext::reportExpr(
             AssertionInfo const &info,
@@ -584,7 +599,7 @@ namespace Catch {
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
         assertionResult.m_resultData.lazyExpression.m_transientExpression = expr;
 
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
     }
 
     void RunContext::handleMessage(
@@ -593,22 +608,23 @@ namespace Catch {
             StringRef message,
             AssertionReaction& reaction
     ) {
-        m_reporter->assertionStarting( info );
-
         m_lastAssertionInfo = info;
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         data.message = static_cast<std::string>(message);
         AssertionResult assertionResult{ m_lastAssertionInfo,
                                          CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
-        if ( !assertionResult.isOk() ) {
+
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) {
             populateReaction( reaction );
         } else if ( resultType == ResultWas::ExplicitSkip ) {
             // TODO: Need to handle this explicitly, as ExplicitSkip is
             // considered "OK"
             reaction.shouldSkip = true;
         }
+        resetAssertionInfo();
     }
     void RunContext::handleUnexpectedExceptionNotThrown(
             AssertionInfo const& info,
@@ -619,16 +635,17 @@ namespace Catch {
 
     void RunContext::handleUnexpectedInflightException(
             AssertionInfo const& info,
-            std::string const& message,
+            std::string&& message,
             AssertionReaction& reaction
     ) {
         m_lastAssertionInfo = info;
 
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
-        data.message = message;
+        data.message = CATCH_MOVE(message);
         AssertionResult assertionResult{ info, CATCH_MOVE(data) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
         populateReaction( reaction );
+        resetAssertionInfo();
     }
 
     void RunContext::populateReaction( AssertionReaction& reaction ) {
@@ -645,7 +662,8 @@ namespace Catch {
         AssertionResultData data( ResultWas::ThrewException, LazyExpression( false ) );
         data.message = "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"s;
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        resetAssertionInfo();
     }
     void RunContext::handleNonExpr(
             AssertionInfo const &info,
@@ -656,10 +674,11 @@ namespace Catch {
 
         AssertionResultData data( resultType, LazyExpression( false ) );
         AssertionResult assertionResult{ info, CATCH_MOVE( data ) };
-        assertionEnded( assertionResult );
 
-        if( !assertionResult.isOk() )
-            populateReaction( reaction );
+        const auto isOk = assertionResult.isOk();
+        assertionEnded( CATCH_MOVE(assertionResult) );
+        if ( !isOk ) { populateReaction( reaction ); }
+        resetAssertionInfo();
     }
 
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.hpp
index 3928180e..c749304d 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_run_context.hpp
@@ -8,8 +8,9 @@
 #ifndef CATCH_RUN_CONTEXT_HPP_INCLUDED
 #define CATCH_RUN_CONTEXT_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
 #include <catch2/internal/catch_test_registry.hpp>
+#include <catch2/internal/catch_test_run_info.hpp>
 #include <catch2/internal/catch_fatal_condition_handler.hpp>
 #include <catch2/catch_test_case_info.hpp>
 #include <catch2/catch_message.hpp>
@@ -24,13 +25,14 @@
 
 namespace Catch {
 
-    class IMutableContext;
     class IGeneratorTracker;
     class IConfig;
+    class IEventListener;
+    using IEventListenerPtr = Detail::unique_ptr<IEventListener>;
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class RunContext : public IResultCapture {
+    class RunContext final : public IResultCapture {
 
     public:
         RunContext( RunContext const& ) = delete;
@@ -59,7 +61,7 @@ namespace Catch {
                     AssertionReaction& reaction ) override;
         void handleUnexpectedInflightException
                 (   AssertionInfo const& info,
-                    std::string const& message,
+                    std::string&& message,
                     AssertionReaction& reaction ) override;
         void handleIncomplete
                 (   AssertionInfo const& info ) override;
@@ -68,6 +70,7 @@ namespace Catch {
                     ResultWas::OfType resultType,
                     AssertionReaction &reaction ) override;
 
+        void notifyAssertionStarted( AssertionInfo const& info ) override;
         bool sectionStarted( StringRef sectionName,
                              SourceLineInfo const& sectionLineInfo,
                              Counts& assertions ) override;
@@ -118,7 +121,7 @@ namespace Catch {
         void resetAssertionInfo();
         bool testForMissingAssertions( Counts& assertions );
 
-        void assertionEnded( AssertionResult const& result );
+        void assertionEnded( AssertionResult&& result );
         void reportExpr
                 (   AssertionInfo const &info,
                     ResultWas::OfType resultType,
@@ -132,7 +135,6 @@ namespace Catch {
         void handleUnfinishedSections();
 
         TestRunInfo m_runInfo;
-        IMutableContext& m_context;
         TestCaseHandle const* m_activeTestCase = nullptr;
         ITracker* m_testCaseTracker = nullptr;
         Optional<AssertionResult> m_lastResult;
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.cpp
index 061732b1..677c2164 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.cpp
@@ -6,7 +6,7 @@
 
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_section.hpp>
-#include <catch2/internal/catch_run_context.hpp>
+#include <catch2/interfaces/catch_interfaces_capture.hpp>
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.hpp
index 8c1a882b..8c894eeb 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_section.hpp
@@ -9,6 +9,7 @@
 #define CATCH_SECTION_HPP_INCLUDED
 
 #include <catch2/internal/catch_compiler_capabilities.hpp>
+#include <catch2/internal/catch_config_static_analysis_support.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
 #include <catch2/catch_section_info.hpp>
 #include <catch2/catch_timer.hpp>
@@ -38,16 +39,62 @@ namespace Catch {
 
 } // end namespace Catch
 
-#define INTERNAL_CATCH_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
-
-#define INTERNAL_CATCH_DYNAMIC_SECTION( ... ) \
-    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-    CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \
-    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+#    define INTERNAL_CATCH_SECTION( ... )                                 \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                         \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                  \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(            \
+                 catch_internal_Section ) =                               \
+                 Catch::Section( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                     \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                     \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS              \
+        if ( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(        \
+                 catch_internal_Section ) =                           \
+                 Catch::SectionInfo(                                  \
+                     CATCH_INTERNAL_LINEINFO,                         \
+                     ( Catch::ReusableStringStream() << __VA_ARGS__ ) \
+                         .str() ) )                                   \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#else
+
+// These section definitions imply that at most one section at one level
+// will be intered (because only one section's __LINE__ can be equal to
+// the dummy `catchInternalSectionHint` variable from `TEST_CASE`).
+
+namespace Catch {
+    namespace Detail {
+        // Intentionally without linkage, as it should only be used as a dummy
+        // symbol for static analysis.
+        int GetNewSectionHint();
+    } // namespace Detail
+} // namespace Catch
+
+
+#    define INTERNAL_CATCH_SECTION( ... )                                   \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#    define INTERNAL_CATCH_DYNAMIC_SECTION( ... )                           \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                    \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                             \
+        if ( [[maybe_unused]] const int catchInternalPreviousSectionHint =  \
+                 catchInternalSectionHint,                                  \
+             catchInternalSectionHint = Catch::Detail::GetNewSectionHint(); \
+             catchInternalPreviousSectionHint == __LINE__ )                 \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+#endif
+
 
 #endif // CATCH_SECTION_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_sharding.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_sharding.hpp
index d0e4cfa1..22561f4b 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_sharding.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_sharding.hpp
@@ -8,8 +8,7 @@
 #ifndef CATCH_SHARDING_HPP_INCLUDED
 #define CATCH_SHARDING_HPP_INCLUDED
 
-#include <catch2/catch_session.hpp>
-
+#include <cassert>
 #include <cmath>
 #include <algorithm>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_stream_end_stop.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_stream_end_stop.hpp
index 61379f20..66d678cf 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_stream_end_stop.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_stream_end_stop.hpp
@@ -17,10 +17,10 @@ namespace Catch {
     // as well as
     //    << stuff +StreamEndStop
     struct StreamEndStop {
-        StringRef operator+() const { return StringRef(); }
+        constexpr StringRef operator+() const { return StringRef(); }
 
         template <typename T>
-        friend T const& operator+( T const& value, StreamEndStop ) {
+        constexpr friend T const& operator+( T const& value, StreamEndStop ) {
             return value;
         }
     };
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_string_manip.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_string_manip.cpp
index cb96dd4f..0c889ca1 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_string_manip.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_string_manip.cpp
@@ -8,7 +8,6 @@
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_stringref.hpp>
 
-#include <algorithm>
 #include <ostream>
 #include <cstring>
 #include <cctype>
@@ -32,9 +31,9 @@ namespace Catch {
         return s.find( infix ) != std::string::npos;
     }
     void toLowerInPlace( std::string& s ) {
-        std::transform( s.begin(), s.end(), s.begin(), []( char c ) {
-            return toLower( c );
-        } );
+        for ( char& c : s ) {
+            c = toLower( c );
+        }
     }
     std::string toLower( std::string const& s ) {
         std::string lc = s;
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_stringref.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_stringref.hpp
index 99bb9a98..4b9212bf 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_stringref.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_stringref.hpp
@@ -25,6 +25,8 @@ namespace Catch {
         using size_type = std::size_t;
         using const_iterator = const char*;
 
+        static constexpr size_type npos{ static_cast<size_type>( -1 ) };
+
     private:
         static constexpr char const* const s_empty = "";
 
@@ -75,7 +77,7 @@ namespace Catch {
         }
 
         // Returns a substring of [start, start + length).
-        // If start + length > size(), then the substring is [start, start + size()).
+        // If start + length > size(), then the substring is [start, size()).
         // If start > size(), then the substring is empty.
         constexpr StringRef substr(size_type start, size_type length) const noexcept {
             if (start < m_size) {
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_tag_alias_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_tag_alias_registry.cpp
index b7c6b9ec..510df167 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_tag_alias_registry.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_tag_alias_registry.cpp
@@ -6,14 +6,13 @@
 
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_tag_alias_registry.hpp>
-#include <catch2/internal/catch_console_colour.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 
 namespace Catch {
 
-    TagAliasRegistry::~TagAliasRegistry() {}
+    TagAliasRegistry::~TagAliasRegistry() = default;
 
     TagAlias const* TagAliasRegistry::find( std::string const& alias ) const {
         auto it = m_registry.find( alias );
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
index 4b3d2e47..c2b052da 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.cpp
@@ -7,11 +7,9 @@
 // SPDX-License-Identifier: BSL-1.0
 #include <catch2/internal/catch_test_case_registry_impl.hpp>
 
-#include <catch2/internal/catch_context.hpp>
 #include <catch2/internal/catch_enforce.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
-#include <catch2/internal/catch_random_number_generator.hpp>
-#include <catch2/internal/catch_run_context.hpp>
 #include <catch2/internal/catch_sharding.hpp>
 #include <catch2/catch_test_case_info.hpp>
 #include <catch2/catch_test_spec.hpp>
@@ -23,6 +21,38 @@
 
 namespace Catch {
 
+    namespace {
+        static void enforceNoDuplicateTestCases(
+            std::vector<TestCaseHandle> const& tests ) {
+            auto testInfoCmp = []( TestCaseInfo const* lhs,
+                                   TestCaseInfo const* rhs ) {
+                return *lhs < *rhs;
+            };
+            std::set<TestCaseInfo const*, decltype( testInfoCmp )&> seenTests(
+                testInfoCmp );
+            for ( auto const& test : tests ) {
+                const auto infoPtr = &test.getTestCaseInfo();
+                const auto prev = seenTests.insert( infoPtr );
+                CATCH_ENFORCE( prev.second,
+                               "error: test case \""
+                                   << infoPtr->name << "\", with tags \""
+                                   << infoPtr->tagsAsString()
+                                   << "\" already defined.\n"
+                                   << "\tFirst seen at "
+                                   << ( *prev.first )->lineInfo << "\n"
+                                   << "\tRedefined at " << infoPtr->lineInfo );
+            }
+        }
+
+        static bool matchTest( TestCaseHandle const& testCase,
+                               TestSpec const& testSpec,
+                               IConfig const& config ) {
+            return testSpec.matches( testCase.getTestCaseInfo() ) &&
+                   isThrowSafe( testCase, config );
+        }
+
+    } // end unnamed namespace
+
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases ) {
         switch (config.runOrder()) {
         case TestRunOrder::Declared:
@@ -40,7 +70,6 @@ namespace Catch {
             return sorted;
         }
         case TestRunOrder::Randomized: {
-            seedRng(config);
             using TestWithHash = std::pair<TestCaseInfoHasher::hash_t, TestCaseHandle>;
 
             TestCaseInfoHasher h{ config.rngSeed() };
@@ -79,29 +108,6 @@ namespace Catch {
         return !testCase.getTestCaseInfo().throws() || config.allowThrows();
     }
 
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config ) {
-        return testSpec.matches( testCase.getTestCaseInfo() ) && isThrowSafe( testCase, config );
-    }
-
-    void
-    enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& tests ) {
-        auto testInfoCmp = []( TestCaseInfo const* lhs,
-                               TestCaseInfo const* rhs ) {
-            return *lhs < *rhs;
-        };
-        std::set<TestCaseInfo const*, decltype(testInfoCmp) &> seenTests(testInfoCmp);
-        for ( auto const& test : tests ) {
-            const auto infoPtr = &test.getTestCaseInfo();
-            const auto prev = seenTests.insert( infoPtr );
-            CATCH_ENFORCE(
-                prev.second,
-                "error: test case \"" << infoPtr->name << "\", with tags \""
-                    << infoPtr->tagsAsString() << "\" already defined.\n"
-                    << "\tFirst seen at " << ( *prev.first )->lineInfo << "\n"
-                    << "\tRedefined at " << infoPtr->lineInfo );
-        }
-    }
-
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config ) {
         std::vector<TestCaseHandle> filtered;
         filtered.reserve( testCases.size() );
@@ -142,11 +148,4 @@ namespace Catch {
         return m_sortedFunctions;
     }
 
-
-
-    ///////////////////////////////////////////////////////////////////////////
-    void TestInvokerAsFunction::invoke() const {
-        m_testAsFunction();
-    }
-
 } // end namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
index 228dbb79..99a38498 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_registry_impl.hpp
@@ -8,31 +8,28 @@
 #ifndef CATCH_TEST_CASE_REGISTRY_IMPL_HPP_INCLUDED
 #define CATCH_TEST_CASE_REGISTRY_IMPL_HPP_INCLUDED
 
-#include <catch2/internal/catch_test_registry.hpp>
+#include <catch2/interfaces/catch_interfaces_testcase.hpp>
 #include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/internal/catch_unique_ptr.hpp>
 
 #include <vector>
 
 namespace Catch {
 
-    class TestCaseHandle;
     class IConfig;
+    class ITestInvoker;
+    class TestCaseHandle;
     class TestSpec;
 
     std::vector<TestCaseHandle> sortTests( IConfig const& config, std::vector<TestCaseHandle> const& unsortedTestCases );
 
     bool isThrowSafe( TestCaseHandle const& testCase, IConfig const& config );
-    bool matchTest( TestCaseHandle const& testCase, TestSpec const& testSpec, IConfig const& config );
-
-    void enforceNoDuplicateTestCases( std::vector<TestCaseHandle> const& functions );
 
     std::vector<TestCaseHandle> filterTests( std::vector<TestCaseHandle> const& testCases, TestSpec const& testSpec, IConfig const& config );
     std::vector<TestCaseHandle> const& getAllTestCasesSorted( IConfig const& config );
 
     class TestRegistry : public ITestCaseRegistry {
     public:
-        ~TestRegistry() override = default;
-
         void registerTest( Detail::unique_ptr<TestCaseInfo> testInfo, Detail::unique_ptr<ITestInvoker> testInvoker );
 
         std::vector<TestCaseInfo*> const& getAllInfos() const override;
@@ -53,18 +50,6 @@ namespace Catch {
 
     ///////////////////////////////////////////////////////////////////////////
 
-    class TestInvokerAsFunction final : public ITestInvoker {
-        using TestType = void(*)();
-        TestType m_testAsFunction;
-    public:
-        TestInvokerAsFunction(TestType testAsFunction) noexcept:
-            m_testAsFunction(testAsFunction) {}
-
-        void invoke() const override;
-    };
-
-    ///////////////////////////////////////////////////////////////////////////
-
 
 } // end namespace Catch
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_tracker.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_tracker.hpp
index beff8d6d..50278c91 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_tracker.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_case_tracker.hpp
@@ -113,7 +113,7 @@ namespace TestCaseTracking {
 
         //! Returns true if tracker run to completion (successfully or not)
         virtual bool isComplete() const = 0;
-        //! Returns true if tracker run to completion succesfully
+        //! Returns true if tracker run to completion successfully
         bool isSuccessfullyCompleted() const {
             return m_runState == CompletedSuccessfully;
         }
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.cpp
index c1edff3c..8ea31313 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.cpp
@@ -20,4 +20,12 @@ namespace Catch {
 #endif
     }
 
+    void throw_test_skip_exception() {
+#if !defined( CATCH_CONFIG_DISABLE_EXCEPTIONS )
+        throw Catch::TestSkipException();
+#else
+        CATCH_ERROR( "Explicitly skipping tests during runtime requires exceptions" );
+#endif
+    }
+
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.hpp
index 13c5fc08..1ef88364 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_failure_exception.hpp
@@ -12,6 +12,8 @@ namespace Catch {
 
     //! Used to signal that an assertion macro failed
     struct TestFailureException{};
+    //! Used to signal that the remainder of a test should be skipped
+    struct TestSkipException {};
 
     /**
      * Outlines throwing of `TestFailureException` into a single TU
@@ -20,8 +22,12 @@ namespace Catch {
      */
     [[noreturn]] void throw_test_failure_exception();
 
-    //! Used to signal that the remainder of a test should be skipped
-    struct TestSkipException{};
+    /**
+     * Outlines throwing of `TestSkipException` into a single TU
+     *
+     * Also handles `CATCH_CONFIG_DISABLE_EXCEPTIONS` for callers.
+     */
+    [[noreturn]] void throw_test_skip_exception();
 
 } // namespace Catch
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_macro_impl.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_macro_impl.hpp
index e569680b..59c851e8 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_macro_impl.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_macro_impl.hpp
@@ -10,6 +10,7 @@
 
 #include <catch2/catch_user_config.hpp>
 #include <catch2/internal/catch_assertion_handler.hpp>
+#include <catch2/internal/catch_preprocessor_internal_stringify.hpp>
 #include <catch2/interfaces/catch_interfaces_capture.hpp>
 #include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_source_line_info.hpp>
@@ -22,12 +23,6 @@
 
 #if !defined(CATCH_CONFIG_DISABLE)
 
-#if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)
-  #define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__##_catch_sr
-#else
-  #define CATCH_INTERNAL_STRINGIFY(...) "Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION"_catch_sr
-#endif
-
 #if defined(CATCH_CONFIG_FAST_COMPILE) || defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -39,7 +34,7 @@
 #else // CATCH_CONFIG_FAST_COMPILE
 
 #define INTERNAL_CATCH_TRY try
-#define INTERNAL_CATCH_CATCH( handler ) catch(...) { handler.handleUnexpectedInflightException(); }
+#define INTERNAL_CATCH_CATCH( handler ) catch(...) { (handler).handleUnexpectedInflightException(); }
 
 #endif
 
@@ -95,6 +90,7 @@
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -115,6 +111,7 @@
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(expr); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
@@ -141,6 +138,7 @@
         if( catchAssertionHandler.allowThrows() ) \
             try { \
                 CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+                CATCH_INTERNAL_SUPPRESS_UNUSED_RESULT \
                 CATCH_INTERNAL_SUPPRESS_USELESS_CAST_WARNINGS \
                 static_cast<void>(__VA_ARGS__); \
                 CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.cpp
index 9769ed03..e9c999fe 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.cpp
@@ -8,7 +8,6 @@
 #include <catch2/internal/catch_test_registry.hpp>
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/catch_test_case_info.hpp>
-#include <catch2/internal/catch_test_case_registry_impl.hpp>
 #include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_string_manip.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
@@ -17,9 +16,10 @@
 #include <iterator>
 
 namespace Catch {
+    ITestInvoker::~ITestInvoker() = default;
 
     namespace {
-        StringRef extractClassName( StringRef classOrMethodName ) {
+        static StringRef extractClassName( StringRef classOrMethodName ) {
             if ( !startsWith( classOrMethodName, '&' ) ) {
                 return classOrMethodName;
             }
@@ -46,6 +46,18 @@ namespace Catch {
                 static_cast<std::size_t>( startIdx ),
                 static_cast<std::size_t>( classNameSize ) );
         }
+
+        class TestInvokerAsFunction final : public ITestInvoker {
+            using TestType = void ( * )();
+            TestType m_testAsFunction;
+
+        public:
+            TestInvokerAsFunction( TestType testAsFunction ) noexcept:
+                m_testAsFunction( testAsFunction ) {}
+
+            void invoke() const override { m_testAsFunction(); }
+        };
+
     } // namespace
 
     Detail::unique_ptr<ITestInvoker> makeTestInvoker( void(*testAsFunction)() ) {
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.hpp
index f53b93c8..7766fe11 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_registry.hpp
@@ -8,9 +8,10 @@
 #ifndef CATCH_TEST_REGISTRY_HPP_INCLUDED
 #define CATCH_TEST_REGISTRY_HPP_INCLUDED
 
+#include <catch2/internal/catch_config_static_analysis_support.hpp>
 #include <catch2/internal/catch_source_line_info.hpp>
 #include <catch2/internal/catch_noncopyable.hpp>
-#include <catch2/interfaces/catch_interfaces_testcase.hpp>
+#include <catch2/interfaces/catch_interfaces_test_invoker.hpp>
 #include <catch2/internal/catch_stringref.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
 #include <catch2/internal/catch_unique_name.hpp>
@@ -72,6 +73,9 @@ struct AutoReg : Detail::NonCopyable {
         void TestName::test()
 #endif
 
+
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT)
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TESTCASE2( TestName, ... ) \
         static void TestName(); \
@@ -84,19 +88,40 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TESTCASE( ... ) \
         INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), __VA_ARGS__ )
 
-    ///////////////////////////////////////////////////////////////////////////////
-    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
-        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
-        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
-        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
-        namespace {                                                           \
-        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
-            Catch::makeTestInvoker( &QualifiedMethod ),                   \
-            CATCH_INTERNAL_LINEINFO,                                      \
-            "&" #QualifiedMethod##_catch_sr,                              \
-            Catch::NameAndTags{ __VA_ARGS__ } );                          \
-    } /* NOLINT */ \
-        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#else  // ^^ !CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT | vv CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
+
+
+// Dummy registrator for the dumy test case macros
+namespace Catch {
+    namespace Detail {
+        struct DummyUse {
+            DummyUse( void ( * )( int ) );
+        };
+    } // namespace Detail
+} // namespace Catch
+
+// Note that both the presence of the argument and its exact name are
+// necessary for the section support.
+
+// We provide a shadowed variable so that a `SECTION` inside non-`TEST_CASE`
+// tests can compile. The redefined `TEST_CASE` shadows this with param.
+static int catchInternalSectionHint = 0;
+
+#    define INTERNAL_CATCH_TESTCASE2( fname )                              \
+        static void fname( int );                                          \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                          \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                           \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS                   \
+        static const Catch::Detail::DummyUse INTERNAL_CATCH_UNIQUE_NAME(   \
+            dummyUser )( &(fname) );                                       \
+        CATCH_INTERNAL_SUPPRESS_SHADOW_WARNINGS                            \
+        static void fname( [[maybe_unused]] int catchInternalSectionHint ) \
+            CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#    define INTERNAL_CATCH_TESTCASE( ... ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( dummyFunction ) )
+
+
+#endif // CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT
 
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\
@@ -118,6 +143,22 @@ struct AutoReg : Detail::NonCopyable {
     #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \
         INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( CATCH2_INTERNAL_TEST_ ), ClassName, __VA_ARGS__ )
 
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_VARIABLE_WARNINGS \
+        namespace {                                                           \
+        const Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( \
+            Catch::makeTestInvoker( &QualifiedMethod ),                   \
+            CATCH_INTERNAL_LINEINFO,                                      \
+            "&" #QualifiedMethod##_catch_sr,                              \
+            Catch::NameAndTags{ __VA_ARGS__ } );                          \
+    } /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+
     ///////////////////////////////////////////////////////////////////////////////
     #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \
         do { \
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_run_info.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_run_info.hpp
new file mode 100644
index 00000000..90357b0a
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_test_run_info.hpp
@@ -0,0 +1,22 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+#ifndef CATCH_TEST_RUN_INFO_HPP_INCLUDED
+#define CATCH_TEST_RUN_INFO_HPP_INCLUDED
+
+#include <catch2/internal/catch_stringref.hpp>
+
+namespace Catch {
+
+    struct TestRunInfo {
+        constexpr TestRunInfo(StringRef _name) : name(_name) {}
+        StringRef name;
+    };
+
+} // end namespace Catch
+
+#endif // CATCH_TEST_RUN_INFO_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.cpp
index 7eac9732..857fd2b9 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.cpp
@@ -233,23 +233,36 @@ namespace Catch {
             return os;
         }
 
-        Columns Column::operator+( Column const& other ) {
+        Columns operator+(Column const& lhs, Column const& rhs) {
             Columns cols;
-            cols += *this;
-            cols += other;
+            cols += lhs;
+            cols += rhs;
             return cols;
         }
-
-        Columns& Columns::operator+=( Column const& col ) {
-            m_columns.push_back( col );
-            return *this;
+        Columns operator+(Column&& lhs, Column&& rhs) {
+            Columns cols;
+            cols += CATCH_MOVE( lhs );
+            cols += CATCH_MOVE( rhs );
+            return cols;
         }
 
-        Columns Columns::operator+( Column const& col ) {
-            Columns combined = *this;
-            combined += col;
+        Columns& operator+=(Columns& lhs, Column const& rhs) {
+            lhs.m_columns.push_back( rhs );
+            return lhs;
+        }
+        Columns& operator+=(Columns& lhs, Column&& rhs) {
+            lhs.m_columns.push_back( CATCH_MOVE(rhs) );
+            return lhs;
+        }
+        Columns operator+( Columns const& lhs, Column const& rhs ) {
+            auto combined( lhs );
+            combined += rhs;
             return combined;
         }
+        Columns operator+( Columns&& lhs, Column&& rhs ) {
+            lhs += CATCH_MOVE( rhs );
+            return CATCH_MOVE( lhs );
+        }
 
     } // namespace TextFlow
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.hpp
index ceac675d..a78451d5 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_textflow.hpp
@@ -8,8 +8,10 @@
 #ifndef CATCH_TEXTFLOW_HPP_INCLUDED
 #define CATCH_TEXTFLOW_HPP_INCLUDED
 
-#include <cassert>
 #include <catch2/internal/catch_console_width.hpp>
+#include <catch2/internal/catch_move_and_forward.hpp>
+
+#include <cassert>
 #include <string>
 #include <vector>
 
@@ -37,7 +39,7 @@ namespace Catch {
 
         public:
             /**
-             * Iterates "lines" in `Column` and return sthem
+             * Iterates "lines" in `Column` and returns them
              */
             class const_iterator {
                 friend Column;
@@ -59,7 +61,7 @@ namespace Catch {
                 // Calculates the length of the current line
                 void calcLength();
 
-                // Returns current indention width
+                // Returns current indentation width
                 size_t indentSize() const;
 
                 // Creates an indented and (optionally) suffixed string from
@@ -91,20 +93,35 @@ namespace Catch {
             using iterator = const_iterator;
 
             explicit Column( std::string const& text ): m_string( text ) {}
+            explicit Column( std::string&& text ):
+                m_string( CATCH_MOVE(text)) {}
 
-            Column& width( size_t newWidth ) {
+            Column& width( size_t newWidth ) & {
                 assert( newWidth > 0 );
                 m_width = newWidth;
                 return *this;
             }
-            Column& indent( size_t newIndent ) {
+            Column&& width( size_t newWidth ) && {
+                assert( newWidth > 0 );
+                m_width = newWidth;
+                return CATCH_MOVE( *this );
+            }
+            Column& indent( size_t newIndent ) & {
                 m_indent = newIndent;
                 return *this;
             }
-            Column& initialIndent( size_t newIndent ) {
+            Column&& indent( size_t newIndent ) && {
+                m_indent = newIndent;
+                return CATCH_MOVE( *this );
+            }
+            Column& initialIndent( size_t newIndent ) & {
                 m_initialIndent = newIndent;
                 return *this;
             }
+            Column&& initialIndent( size_t newIndent ) && {
+                m_initialIndent = newIndent;
+                return CATCH_MOVE( *this );
+            }
 
             size_t width() const { return m_width; }
             const_iterator begin() const { return const_iterator( *this ); }
@@ -113,7 +130,8 @@ namespace Catch {
             friend std::ostream& operator<<( std::ostream& os,
                                              Column const& col );
 
-            Columns operator+( Column const& other );
+            friend Columns operator+( Column const& lhs, Column const& rhs );
+            friend Columns operator+( Column&& lhs, Column&& rhs );
         };
 
         //! Creates a column that serves as an empty space of specific width
@@ -157,8 +175,10 @@ namespace Catch {
             iterator begin() const { return iterator( *this ); }
             iterator end() const { return { *this, iterator::EndTag() }; }
 
-            Columns& operator+=( Column const& col );
-            Columns operator+( Column const& col );
+            friend Columns& operator+=( Columns& lhs, Column const& rhs );
+            friend Columns& operator+=( Columns& lhs, Column&& rhs );
+            friend Columns operator+( Columns const& lhs, Column const& rhs );
+            friend Columns operator+( Columns&& lhs, Column&& rhs );
 
             friend std::ostream& operator<<( std::ostream& os,
                                              Columns const& cols );
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_uncaught_exceptions.cpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
index 704d6e1c..8cfabc0f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uncaught_exceptions.cpp
@@ -7,7 +7,6 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/internal/catch_uncaught_exceptions.hpp>
-#include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_config_uncaught_exceptions.hpp>
 #include <catch2/catch_user_config.hpp>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp
new file mode 100644
index 00000000..23d03b43
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp
@@ -0,0 +1,131 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
+
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
+
+#include <cmath>
+#include <type_traits>
+
+namespace Catch {
+
+    namespace Detail {
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        // The issue with overflow only happens with maximal ULP and HUGE
+        // distance, e.g. when generating numbers in [-inf, inf] for given
+        // type. So we only check for the largest possible ULP in the
+        // type, and return something that does not overflow to inf in 1 mult.
+        constexpr std::uint64_t calculate_max_steps_in_one_go(double gamma) {
+            if ( gamma == 1.99584030953472e+292 ) { return 9007199254740991; }
+            return static_cast<std::uint64_t>( -1 );
+        }
+        constexpr std::uint32_t calculate_max_steps_in_one_go(float gamma) {
+            if ( gamma == 2.028241e+31f ) { return 16777215; }
+            return static_cast<std::uint32_t>( -1 );
+        }
+#if defined( __GNUC__ ) || defined( __clang__ )
+#    pragma GCC diagnostic pop
+#endif
+    }
+
+/**
+ * Implementation of uniform distribution on floating point numbers.
+ *
+ * Note that we support only `float` and `double` types, because these
+ * usually mean the same thing across different platform. `long double`
+ * varies wildly by platform and thus we cannot provide reproducible
+ * implementation. Also note that we don't implement all parts of
+ * distribution per standard: this distribution is not serializable, nor
+ * can the range be arbitrarily reset.
+ *
+ * The implementation also uses different approach than the one taken by
+ * `std::uniform_real_distribution`, where instead of generating a number
+ * between [0, 1) and then multiplying the range bounds with it, we first
+ * split the [a, b] range into a set of equidistributed floating point
+ * numbers, and then use uniform int distribution to pick which one to
+ * return.
+ *
+ * This has the advantage of guaranteeing uniformity (the multiplication
+ * method loses uniformity due to rounding when multiplying floats), except
+ * for small non-uniformity at one side of the interval, where we have
+ * to deal with the fact that not every interval is splittable into
+ * equidistributed floats.
+ *
+ * Based on "Drawing random floating-point numbers from an interval" by
+ * Frederic Goualard.
+ */
+template <typename FloatType>
+class uniform_floating_point_distribution {
+    static_assert(std::is_floating_point<FloatType>::value, "...");
+    static_assert(!std::is_same<FloatType, long double>::value,
+                  "We do not support long double due to inconsistent behaviour between platforms");
+
+    using WidthType = Detail::DistanceType<FloatType>;
+
+    FloatType m_a, m_b;
+    FloatType m_ulp_magnitude;
+    WidthType m_floats_in_range;
+    uniform_integer_distribution<WidthType> m_int_dist;
+
+    // In specific cases, we can overflow into `inf` when computing the
+    // `steps * g` offset. To avoid this, we don't offset by more than this
+    // in one multiply + addition.
+    WidthType m_max_steps_in_one_go;
+    // We don't want to do the magnitude check every call to `operator()`
+    bool m_a_has_leq_magnitude;
+
+public:
+    using result_type = FloatType;
+
+    uniform_floating_point_distribution( FloatType a, FloatType b ):
+        m_a( a ),
+        m_b( b ),
+        m_ulp_magnitude( Detail::gamma( m_a, m_b ) ),
+        m_floats_in_range( Detail::count_equidistant_floats( m_a, m_b, m_ulp_magnitude ) ),
+        m_int_dist(0, m_floats_in_range),
+        m_max_steps_in_one_go( Detail::calculate_max_steps_in_one_go(m_ulp_magnitude)),
+        m_a_has_leq_magnitude(std::fabs(m_a) <= std::fabs(m_b))
+    {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        WidthType steps = m_int_dist( g );
+        if ( m_a_has_leq_magnitude ) {
+            if ( steps == m_floats_in_range ) { return m_a; }
+            auto b = m_b;
+            while (steps > m_max_steps_in_one_go) {
+                b -= m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return b - steps * m_ulp_magnitude;
+        } else {
+            if ( steps == m_floats_in_range ) { return m_b; }
+            auto a = m_a;
+            while (steps > m_max_steps_in_one_go) {
+                a += m_max_steps_in_one_go * m_ulp_magnitude;
+                steps -= m_max_steps_in_one_go;
+            }
+            return a + steps * m_ulp_magnitude;
+        }
+    }
+
+    result_type a() const { return m_a; }
+    result_type b() const { return m_b; }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_FLOATING_POINT_DISTRIBUTION_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp
new file mode 100644
index 00000000..afa2015d
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp
@@ -0,0 +1,124 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+#define CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
+
+#include <catch2/internal/catch_random_integer_helpers.hpp>
+
+namespace Catch {
+
+    namespace Detail {
+        // Indirection to enable make_unsigned<bool> behaviour.
+        template <typename T>
+        struct make_unsigned {
+            using type = std::make_unsigned_t<T>;
+        };
+
+        template <>
+        struct make_unsigned<bool> {
+            using type = uint8_t;
+        };
+
+        template <typename T>
+        using make_unsigned_t = typename make_unsigned<T>::type;
+    }
+
+/**
+ * Implementation of uniform distribution on integers.
+ *
+ * Unlike `std::uniform_int_distribution`, this implementation supports
+ * various 1 byte integral types, including bool (but you should not
+ * actually use it for bools).
+ *
+ * The underlying algorithm is based on the one described in "Fast Random
+ * Integer Generation in an Interval" by Daniel Lemire, but has been
+ * optimized under the assumption of reuse of the same distribution object.
+ */
+template <typename IntegerType>
+class uniform_integer_distribution {
+    static_assert(std::is_integral<IntegerType>::value, "...");
+
+    using UnsignedIntegerType = Detail::make_unsigned_t<IntegerType>;
+
+    // Only the left bound is stored, and we store it converted to its
+    // unsigned image. This avoids having to do the conversions inside
+    // the operator(), at the cost of having to do the conversion in
+    // the a() getter. The right bound is only needed in the b() getter,
+    // so we recompute it there from other stored data.
+    UnsignedIntegerType m_a;
+
+    // How many different values are there in [a, b]. a == b => 1, can be 0 for distribution over all values in the type.
+    UnsignedIntegerType m_ab_distance;
+
+    // We hoisted this out of the main generation function. Technically,
+    // this means that using this distribution will be slower than Lemire's
+    // algorithm if this distribution instance will be used only few times,
+    // but it will be faster if it is used many times. Since Catch2 uses
+    // distributions only to implement random generators, we assume that each
+    // distribution will be reused many times and this is an optimization.
+    UnsignedIntegerType m_rejection_threshold = 0;
+
+    UnsignedIntegerType computeDistance(IntegerType a, IntegerType b) const {
+        // This overflows and returns 0 if a == 0 and b == TYPE_MAX.
+        // We handle that later when generating the number.
+        return transposeTo(b) - transposeTo(a) + 1;
+    }
+
+    static UnsignedIntegerType computeRejectionThreshold(UnsignedIntegerType ab_distance) {
+        // distance == 0 means that we will return all possible values from
+        // the type's range, and that we shouldn't reject anything.
+        if ( ab_distance == 0 ) { return 0; }
+        return ( ~ab_distance + 1 ) % ab_distance;
+    }
+
+    static UnsignedIntegerType transposeTo(IntegerType in) {
+        return Detail::transposeToNaturalOrder<IntegerType>(
+            static_cast<UnsignedIntegerType>( in ) );
+    }
+    static IntegerType transposeBack(UnsignedIntegerType in) {
+        return static_cast<IntegerType>(
+            Detail::transposeToNaturalOrder<IntegerType>(in) );
+    }
+
+public:
+    using result_type = IntegerType;
+
+    uniform_integer_distribution( IntegerType a, IntegerType b ):
+        m_a( transposeTo(a) ),
+        m_ab_distance( computeDistance(a, b) ),
+        m_rejection_threshold( computeRejectionThreshold(m_ab_distance) ) {
+        assert( a <= b );
+    }
+
+    template <typename Generator>
+    result_type operator()( Generator& g ) {
+        // All possible values of result_type are valid.
+        if ( m_ab_distance == 0 ) {
+            return transposeBack( Detail::fillBitsFrom<UnsignedIntegerType>( g ) );
+        }
+
+        auto random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+        auto emul = Detail::extendedMult( random_number, m_ab_distance );
+        // Unlike Lemire's algorithm we skip the ab_distance check, since
+        // we precomputed the rejection threshold, which is always tighter.
+        while (emul.lower < m_rejection_threshold) {
+            random_number = Detail::fillBitsFrom<UnsignedIntegerType>( g );
+            emul = Detail::extendedMult( random_number, m_ab_distance );
+        }
+
+        return transposeBack(m_a + emul.upper);
+    }
+
+    result_type a() const { return transposeBack(m_a); }
+    result_type b() const { return transposeBack(m_ab_distance + m_a - 1); }
+};
+
+} // end namespace Catch
+
+#endif // CATCH_UNIFORM_INTEGER_DISTRIBUTION_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_floating_point.cpp b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
index 6e596466..206332ef 100644
--- a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_floating_point.cpp
@@ -38,26 +38,11 @@ namespace {
         return ulpDist <= maxUlpDiff;
     }
 
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
-
-    float nextafter(float x, float y) {
-        return ::nextafterf(x, y);
-    }
-
-    double nextafter(double x, double y) {
-        return ::nextafter(x, y);
-    }
-
-#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
 
 template <typename FP>
 FP step(FP start, FP direction, uint64_t steps) {
     for (uint64_t i = 0; i < steps; ++i) {
-#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
         start = Catch::nextafter(start, direction);
-#else
-        start = std::nextafter(start, direction);
-#endif
     }
     return start;
 }
diff --git a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_range_equals.hpp b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_range_equals.hpp
index ce66bed9..95b781a4 100644
--- a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_range_equals.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_range_equals.hpp
@@ -129,7 +129,7 @@ namespace Catch {
 
         /**
          * Creates a matcher that checks if all elements in a range are equal
-         * to all elements in another range, in some permuation.
+         * to all elements in another range, in some permutation.
          *
          * Uses to provided predicate `predicate` to do the comparisons
          */
diff --git a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_vector.hpp b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_vector.hpp
index 9a4b024f..fffbfdf6 100644
--- a/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_vector.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/matchers/catch_matchers_vector.hpp
@@ -85,11 +85,10 @@ namespace Matchers {
             // - a more general approach would be via a compare template that defaults
             // to using !=. but could be specialised for, e.g. std::vector<T> etc
             // - then just call that directly
-            if (m_comparator.size() != v.size())
-                return false;
-            for (std::size_t i = 0; i < v.size(); ++i)
-                if (m_comparator[i] != v[i])
-                    return false;
+            if ( m_comparator.size() != v.size() ) { return false; }
+            for ( std::size_t i = 0; i < v.size(); ++i ) {
+                if ( !( m_comparator[i] == v[i] ) ) { return false; }
+            }
             return true;
         }
         std::string describe() const override {
diff --git a/alpaka/thirdParty/catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp b/alpaka/thirdParty/catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp
index 12455bfe..2ee9f0c0 100644
--- a/alpaka/thirdParty/catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp
@@ -8,9 +8,14 @@
 #ifndef CATCH_MATCHERS_IMPL_HPP_INCLUDED
 #define CATCH_MATCHERS_IMPL_HPP_INCLUDED
 
-#include <catch2/internal/catch_test_macro_impl.hpp>
+#include <catch2/internal/catch_assertion_handler.hpp>
+#include <catch2/internal/catch_source_line_info.hpp>
+#include <catch2/internal/catch_decomposer.hpp>
+#include <catch2/internal/catch_preprocessor_internal_stringify.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
+#include <string>
+
 namespace Catch {
 
     template<typename ArgT, typename MatcherT>
diff --git a/alpaka/thirdParty/catch2/src/catch2/meson.build b/alpaka/thirdParty/catch2/src/catch2/meson.build
index 0e114065..cc45e641 100644
--- a/alpaka/thirdParty/catch2/src/catch2/meson.build
+++ b/alpaka/thirdParty/catch2/src/catch2/meson.build
@@ -18,6 +18,8 @@ configure_file(
   configuration: conf_data,
 )
 
+fs = import('fs')
+
 benchmark_headers = [
   'benchmark/catch_benchmark.hpp',
   'benchmark/catch_benchmark_all.hpp',
@@ -32,6 +34,8 @@ benchmark_headers = [
   'benchmark/catch_sample_analysis.hpp',
   'benchmark/detail/catch_analyse.hpp',
   'benchmark/detail/catch_benchmark_function.hpp',
+  'benchmark/detail/catch_benchmark_stats.hpp',
+  'benchmark/detail/catch_benchmark_stats_fwd.hpp',
   'benchmark/detail/catch_complete_invoke.hpp',
   'benchmark/detail/catch_estimate_clock.hpp',
   'benchmark/detail/catch_measure.hpp',
@@ -43,6 +47,7 @@ benchmark_headers = [
 
 benchmark_sources = files(
   'benchmark/catch_chronometer.cpp',
+  'benchmark/detail/catch_analyse.cpp',
   'benchmark/detail/catch_benchmark_function.cpp',
   'benchmark/detail/catch_run_for_at_least.cpp',
   'benchmark/detail/catch_stats.cpp',
@@ -64,8 +69,8 @@ internal_headers = [
   'interfaces/catch_interfaces_registry_hub.hpp',
   'interfaces/catch_interfaces_reporter.hpp',
   'interfaces/catch_interfaces_reporter_factory.hpp',
-  'interfaces/catch_interfaces_reporter_registry.hpp',
   'interfaces/catch_interfaces_tag_alias_registry.hpp',
+  'interfaces/catch_interfaces_test_invoker.hpp',
   'interfaces/catch_interfaces_testcase.hpp',
   'internal/catch_assertion_handler.hpp',
   'internal/catch_case_insensitive_comparisons.hpp',
@@ -76,6 +81,7 @@ internal_headers = [
   'internal/catch_compiler_capabilities.hpp',
   'internal/catch_config_android_logwrite.hpp',
   'internal/catch_config_counter.hpp',
+  'internal/catch_config_static_analysis_support.hpp',
   'internal/catch_config_uncaught_exceptions.hpp',
   'internal/catch_config_wchar.hpp',
   'internal/catch_console_colour.hpp',
@@ -94,6 +100,7 @@ internal_headers = [
   'internal/catch_getenv.hpp',
   'internal/catch_istream.hpp',
   'internal/catch_is_permutation.hpp',
+  'internal/catch_jsonwriter.hpp',
   'internal/catch_lazy_expr.hpp',
   'internal/catch_leak_detector.hpp',
   'internal/catch_list.hpp',
@@ -108,7 +115,10 @@ internal_headers = [
   'internal/catch_platform.hpp',
   'internal/catch_polyfills.hpp',
   'internal/catch_preprocessor.hpp',
+  'internal/catch_preprocessor_internal_stringify.hpp',
   'internal/catch_preprocessor_remove_parens.hpp',
+  'internal/catch_random_floating_point_helpers.hpp',
+  'internal/catch_random_integer_helpers.hpp',
   'internal/catch_random_number_generator.hpp',
   'internal/catch_random_seed_generation.hpp',
   'internal/catch_reporter_registry.hpp',
@@ -133,10 +143,13 @@ internal_headers = [
   'internal/catch_test_failure_exception.hpp',
   'internal/catch_test_macro_impl.hpp',
   'internal/catch_test_registry.hpp',
+  'internal/catch_test_run_info.hpp',
   'internal/catch_test_spec_parser.hpp',
   'internal/catch_textflow.hpp',
   'internal/catch_to_string.hpp',
   'internal/catch_uncaught_exceptions.hpp',
+  'internal/catch_uniform_floating_point_distribution.hpp',
+  'internal/catch_uniform_integer_distribution.hpp',
   'internal/catch_unique_name.hpp',
   'internal/catch_unique_ptr.hpp',
   'internal/catch_void_type.hpp',
@@ -151,6 +164,7 @@ internal_headers = [
   'matchers/catch_matchers_floating_point.hpp',
   'matchers/catch_matchers_predicate.hpp',
   'matchers/catch_matchers_quantifiers.hpp',
+  'matchers/catch_matchers_range_equals.hpp',
   'matchers/catch_matchers_string.hpp',
   'matchers/catch_matchers_templated.hpp',
   'matchers/catch_matchers_vector.hpp',
@@ -189,7 +203,6 @@ internal_sources = files(
   'interfaces/catch_interfaces_registry_hub.cpp',
   'interfaces/catch_interfaces_reporter.cpp',
   'interfaces/catch_interfaces_reporter_factory.cpp',
-  'interfaces/catch_interfaces_reporter_registry.cpp',
   'interfaces/catch_interfaces_testcase.cpp',
   'internal/catch_assertion_handler.cpp',
   'internal/catch_case_insensitive_comparisons.cpp',
@@ -208,6 +221,7 @@ internal_sources = files(
   'internal/catch_floating_point_helpers.cpp',
   'internal/catch_getenv.cpp',
   'internal/catch_istream.cpp',
+  'internal/catch_jsonwriter.cpp',
   'internal/catch_lazy_expr.cpp',
   'internal/catch_leak_detector.cpp',
   'internal/catch_list.cpp',
@@ -262,6 +276,7 @@ internal_sources = files(
   'catch_timer.cpp',
   'catch_tostring.cpp',
   'catch_totals.cpp',
+  'catch_translate_exception.cpp',
   'catch_version.cpp',
 )
 
@@ -273,6 +288,7 @@ reporter_headers = [
   'reporters/catch_reporter_cumulative_base.hpp',
   'reporters/catch_reporter_event_listener.hpp',
   'reporters/catch_reporter_helpers.hpp',
+  'reporters/catch_reporter_json.hpp',
   'reporters/catch_reporter_junit.hpp',
   'reporters/catch_reporter_multi.hpp',
   'reporters/catch_reporter_registrars.hpp',
@@ -292,6 +308,7 @@ reporter_sources = files(
   'reporters/catch_reporter_cumulative_base.cpp',
   'reporters/catch_reporter_event_listener.cpp',
   'reporters/catch_reporter_helpers.cpp',
+  'reporters/catch_reporter_json.cpp',
   'reporters/catch_reporter_junit.cpp',
   'reporters/catch_reporter_multi.cpp',
   'reporters/catch_reporter_registrars.cpp',
@@ -325,9 +342,19 @@ foreach file : headers
   install_headers(file, subdir: join_paths(include_subdir, folder))
 endforeach
 
+catch2_dependencies = []
+# Check if this is an Android NDK build.
+if ((host_machine.system() == 'android') or
+  # Check if this is an Android Termux build.
+  (host_machine.system() == 'linux' and fs.is_dir('/data/data/com.termux')))
+  log_dep = meson.get_compiler('cpp').find_library('log')
+  catch2_dependencies += log_dep
+endif
+
 catch2 = static_library(
   'Catch2',
   sources,
+  dependencies: catch2_dependencies,
   include_directories: '..',
   install: true,
 )
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.cpp
index 993b594b..5e506a6b 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.cpp
@@ -12,7 +12,7 @@
 
 namespace Catch {
 
-    AutomakeReporter::~AutomakeReporter() {}
+    AutomakeReporter::~AutomakeReporter() = default;
 
     void AutomakeReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {
         // Possible values to emit are PASS, XFAIL, SKIP, FAIL, XPASS and ERROR.
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.hpp
index 3475a1fd..a639428c 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_automake.hpp
@@ -8,7 +8,6 @@
 #ifndef CATCH_REPORTER_AUTOMAKE_HPP_INCLUDED
 #define CATCH_REPORTER_AUTOMAKE_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/reporters/catch_reporter_streaming_base.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_compact.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_compact.cpp
index 3a9b870c..0f855944 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_compact.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_compact.cpp
@@ -171,7 +171,7 @@ class AssertionPrinter {
             return;
 
         const auto itEnd = messages.cend();
-        const auto N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+        const auto N = static_cast<std::size_t>(itEnd - itMessage);
 
         stream << colourImpl->guardColour( colour ) << " with "
                << pluralise( N, "message"_sr ) << ':';
@@ -249,6 +249,6 @@ class AssertionPrinter {
             StreamingReporterBase::testRunEnded( _testRunStats );
         }
 
-        CompactReporter::~CompactReporter() {}
+        CompactReporter::~CompactReporter() = default;
 
 } // end namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_console.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_console.cpp
index a46b22cf..f3b8b5b1 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_console.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_console.cpp
@@ -209,15 +209,9 @@ findMax( std::size_t& i, std::size_t& j, std::size_t& k, std::size_t& l ) {
         return l;
 }
 
-enum class Justification { Left, Right };
-
-struct ColumnInfo {
-    std::string name;
-    std::size_t width;
-    Justification justification;
-};
 struct ColumnBreak {};
 struct RowBreak {};
+struct OutputFlush {};
 
 class Duration {
     enum class Unit {
@@ -292,6 +286,14 @@ class Duration {
 };
 } // end anon namespace
 
+enum class Justification { Left, Right };
+
+struct ColumnInfo {
+    std::string name;
+    std::size_t width;
+    Justification justification;
+};
+
 class TablePrinter {
     std::ostream& m_os;
     std::vector<ColumnInfo> m_columnInfos;
@@ -314,11 +316,10 @@ class TablePrinter {
             *this << RowBreak();
 
 			TextFlow::Columns headerCols;
-			auto spacer = TextFlow::Spacer(2);
 			for (auto const& info : m_columnInfos) {
                 assert(info.width > 2);
 				headerCols += TextFlow::Column(info.name).width(info.width - 2);
-				headerCols += spacer;
+                headerCols += TextFlow::Spacer( 2 );
 			}
 			m_os << headerCols << '\n';
 
@@ -334,12 +335,12 @@ class TablePrinter {
     }
 
     template<typename T>
-    friend TablePrinter& operator << (TablePrinter& tp, T const& value) {
+    friend TablePrinter& operator<< (TablePrinter& tp, T const& value) {
         tp.m_oss << value;
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, ColumnBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, ColumnBreak) {
         auto colStr = tp.m_oss.str();
         const auto strSize = colStr.size();
         tp.m_oss.str("");
@@ -361,13 +362,18 @@ class TablePrinter {
         return tp;
     }
 
-    friend TablePrinter& operator << (TablePrinter& tp, RowBreak) {
+    friend TablePrinter& operator<< (TablePrinter& tp, RowBreak) {
         if (tp.m_currentColumn > 0) {
             tp.m_os << '\n';
             tp.m_currentColumn = -1;
         }
         return tp;
     }
+
+    friend TablePrinter& operator<<(TablePrinter& tp, OutputFlush) {
+        tp.m_os << std::flush;
+        return tp;
+    }
 };
 
 ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
@@ -389,7 +395,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig&& config):
                 { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, Justification::Left },
                 { "samples      mean       std dev", 14, Justification::Right },
                 { "iterations   low mean   low std dev", 14, Justification::Right },
-                { "estimated    high mean  high std dev", 14, Justification::Right }
+                { "est run time high mean  high std dev", 14, Justification::Right }
             };
         }
     }())) {}
@@ -473,8 +479,11 @@ void ConsoleReporter::benchmarkPreparing( StringRef name ) {
 void ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {
     (*m_tablePrinter) << info.samples << ColumnBreak()
         << info.iterations << ColumnBreak();
-    if (!m_config->benchmarkNoAnalysis())
-        (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();
+    if ( !m_config->benchmarkNoAnalysis() ) {
+        ( *m_tablePrinter )
+            << Duration( info.estimatedDuration ) << ColumnBreak();
+    }
+    ( *m_tablePrinter ) << OutputFlush{};
 }
 void ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {
     if (m_config->benchmarkNoAnalysis())
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp
index cdff9991..267b39fd 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp
@@ -8,7 +8,6 @@
 #ifndef CATCH_REPORTER_CUMULATIVE_BASE_HPP_INCLUDED
 #define CATCH_REPORTER_CUMULATIVE_BASE_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/reporters/catch_reporter_common_base.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 #include <catch2/internal/catch_unique_ptr.hpp>
@@ -125,7 +124,7 @@ namespace Catch {
         void skipTest(TestCaseInfo const&) override {}
 
     protected:
-        //! Should the cumulative base store the assertion expansion for succesful assertions?
+        //! Should the cumulative base store the assertion expansion for successful assertions?
         bool m_shouldStoreSuccesfulAssertions = true;
         //! Should the cumulative base store the assertion expansion for failed assertions?
         bool m_shouldStoreFailedAssertions = true;
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.cpp
new file mode 100644
index 00000000..1f0db8b0
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.cpp
@@ -0,0 +1,372 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+//
+#include <catch2/catch_test_case_info.hpp>
+#include <catch2/catch_test_spec.hpp>
+#include <catch2/catch_version.hpp>
+#include <catch2/interfaces/catch_interfaces_config.hpp>
+#include <catch2/internal/catch_list.hpp>
+#include <catch2/internal/catch_string_manip.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
+
+namespace Catch {
+    namespace {
+        void writeSourceInfo( JsonObjectWriter& writer,
+                              SourceLineInfo const& sourceInfo ) {
+            auto source_location_writer =
+                writer.write( "source-location"_sr ).writeObject();
+            source_location_writer.write( "filename"_sr )
+                .write( sourceInfo.file );
+            source_location_writer.write( "line"_sr ).write( sourceInfo.line );
+        }
+
+        void writeTags( JsonArrayWriter writer, std::vector<Tag> const& tags ) {
+            for ( auto const& tag : tags ) {
+                writer.write( tag.original );
+            }
+        }
+
+        void writeProperties( JsonArrayWriter writer,
+                              TestCaseInfo const& info ) {
+            if ( info.isHidden() ) { writer.write( "is-hidden"_sr ); }
+            if ( info.okToFail() ) { writer.write( "ok-to-fail"_sr ); }
+            if ( info.expectedToFail() ) {
+                writer.write( "expected-to-fail"_sr );
+            }
+            if ( info.throws() ) { writer.write( "throws"_sr ); }
+        }
+
+    } // namespace
+
+    JsonReporter::JsonReporter( ReporterConfig&& config ):
+        StreamingReporterBase{ CATCH_MOVE( config ) } {
+
+        m_preferences.shouldRedirectStdOut = true;
+        // TBD: Do we want to report all assertions? XML reporter does
+        //      not, but for machine-parseable reporters I think the answer
+        //      should be yes.
+        m_preferences.shouldReportAllAssertions = true;
+
+        m_objectWriters.emplace( m_stream );
+        m_writers.emplace( Writer::Object );
+        auto& writer = m_objectWriters.top();
+
+        writer.write( "version"_sr ).write( 1 );
+
+        {
+            auto metadata_writer = writer.write( "metadata"_sr ).writeObject();
+            metadata_writer.write( "name"_sr ).write( m_config->name() );
+            metadata_writer.write( "rng-seed"_sr ).write( m_config->rngSeed() );
+            metadata_writer.write( "catch2-version"_sr )
+                .write( libraryVersion() );
+            if ( m_config->testSpec().hasFilters() ) {
+                metadata_writer.write( "filters"_sr )
+                    .write( m_config->testSpec() );
+            }
+        }
+    }
+
+    JsonReporter::~JsonReporter() {
+        endListing();
+        // TODO: Ensure this closes the top level object, add asserts
+        assert( m_writers.size() == 1 && "Only the top level object should be open" );
+        assert( m_writers.top() == Writer::Object );
+        endObject();
+        m_stream << '\n' << std::flush;
+        assert( m_writers.empty() );
+    }
+
+    JsonArrayWriter& JsonReporter::startArray() {
+        m_arrayWriters.emplace( m_arrayWriters.top().writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+    JsonArrayWriter& JsonReporter::startArray( StringRef key ) {
+        m_arrayWriters.emplace(
+            m_objectWriters.top().write( key ).writeArray() );
+        m_writers.emplace( Writer::Array );
+        return m_arrayWriters.top();
+    }
+
+    JsonObjectWriter& JsonReporter::startObject() {
+        m_objectWriters.emplace( m_arrayWriters.top().writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+    JsonObjectWriter& JsonReporter::startObject( StringRef key ) {
+        m_objectWriters.emplace(
+            m_objectWriters.top().write( key ).writeObject() );
+        m_writers.emplace( Writer::Object );
+        return m_objectWriters.top();
+    }
+
+    void JsonReporter::endObject() {
+        assert( isInside( Writer::Object ) );
+        m_objectWriters.pop();
+        m_writers.pop();
+    }
+    void JsonReporter::endArray() {
+        assert( isInside( Writer::Array ) );
+        m_arrayWriters.pop();
+        m_writers.pop();
+    }
+
+    bool JsonReporter::isInside( Writer writer ) {
+        return !m_writers.empty() && m_writers.top() == writer;
+    }
+
+    void JsonReporter::startListing() {
+        if ( !m_startedListing ) { startObject( "listings"_sr ); }
+        m_startedListing = true;
+    }
+    void JsonReporter::endListing() {
+        if ( m_startedListing ) { endObject(); }
+        m_startedListing = false;
+    }
+
+    std::string JsonReporter::getDescription() {
+        return "Outputs listings as JSON. Test listing is Work-in-Progress!";
+    }
+
+    void JsonReporter::testRunStarting( TestRunInfo const& testInfo ) {
+        StreamingReporterBase::testRunStarting( testInfo );
+        endListing();
+
+        assert( isInside( Writer::Object ) );
+        startObject( "test-run"_sr );
+        startArray( "test-cases"_sr );
+    }
+
+     static void writeCounts( JsonObjectWriter&& writer, Counts const& counts ) {
+        writer.write( "passed"_sr ).write( counts.passed );
+        writer.write( "failed"_sr ).write( counts.failed );
+        writer.write( "fail-but-ok"_sr ).write( counts.failedButOk );
+        writer.write( "skipped"_sr ).write( counts.skipped );
+    }
+
+    void JsonReporter::testRunEnded(TestRunStats const& runStats) {
+        assert( isInside( Writer::Array ) );
+        // End "test-cases"
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         runStats.totals.assertions );
+            writeCounts( totals.write( "test-cases"_sr ).writeObject(),
+                         runStats.totals.testCases );
+        }
+
+        // End the "test-run" object
+        endObject();
+    }
+
+    void JsonReporter::testCaseStarting( TestCaseInfo const& tcInfo ) {
+        StreamingReporterBase::testCaseStarting( tcInfo );
+
+        assert( isInside( Writer::Array ) &&
+                "We should be in the 'test-cases' array" );
+        startObject();
+        // "test-info" prelude
+        {
+            auto testInfo =
+                m_objectWriters.top().write( "test-info"_sr ).writeObject();
+            // TODO: handle testName vs className!!
+            testInfo.write( "name"_sr ).write( tcInfo.name );
+            writeSourceInfo(testInfo, tcInfo.lineInfo);
+            writeTags( testInfo.write( "tags"_sr ).writeArray(), tcInfo.tags );
+            writeProperties( testInfo.write( "properties"_sr ).writeArray(),
+                             tcInfo );
+        }
+
+
+        // Start the array for individual test runs (testCasePartial pairs)
+        startArray( "runs"_sr );
+    }
+
+    void JsonReporter::testCaseEnded( TestCaseStats const& tcStats ) {
+        StreamingReporterBase::testCaseEnded( tcStats );
+
+        // We need to close the 'runs' array before finishing the test case
+        assert( isInside( Writer::Array ) );
+        endArray();
+
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in partial result?
+        }
+        // We do not write out stderr/stdout, because we instead wrote those out in partial runs
+
+        // TODO: aborting?
+
+        // And we also close this test case's object
+        assert( isInside( Writer::Object ) );
+        endObject();
+    }
+
+    void JsonReporter::testCasePartialStarting( TestCaseInfo const& /*tcInfo*/,
+                                                uint64_t index ) {
+        startObject();
+        m_objectWriters.top().write( "run-idx"_sr ).write( index );
+        startArray( "path"_sr );
+        // TODO: we want to delay most of the printing to the 'root' section
+        // TODO: childSection key name?
+    }
+
+    void JsonReporter::testCasePartialEnded( TestCaseStats const& tcStats,
+                                             uint64_t /*index*/ ) {
+        // Fixme: the top level section handles this.
+        //// path object
+        endArray();
+        if ( !tcStats.stdOut.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stdout"_sr )
+                .write( tcStats.stdOut );
+        }
+        if ( !tcStats.stdErr.empty() ) {
+            m_objectWriters.top()
+                .write( "captured-stderr"_sr )
+                .write( tcStats.stdErr );
+        }
+        {
+            auto totals =
+                m_objectWriters.top().write( "totals"_sr ).writeObject();
+            writeCounts( totals.write( "assertions"_sr ).writeObject(),
+                         tcStats.totals.assertions );
+            // We do not write the test case totals, because there will
+            // always be just one test case here.
+            // TODO: overall "result" -> success, skip, fail here? Or in
+            // partial result?
+        }
+        // TODO: aborting?
+        // run object
+        endObject();
+    }
+
+    void JsonReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        assert( isInside( Writer::Array ) &&
+                "Section should always start inside an object" );
+        // We want to nest top level sections, even though it shares name
+        // and source loc with the TEST_CASE
+        auto& sectionObject = startObject();
+        sectionObject.write( "kind"_sr ).write( "section"_sr );
+        sectionObject.write( "name"_sr ).write( sectionInfo.name );
+        writeSourceInfo( m_objectWriters.top(), sectionInfo.lineInfo );
+
+
+        // TBD: Do we want to create this event lazily? It would become
+        //      rather complex, but we could do it, and it would look
+        //      better for empty sections. OTOH, empty sections should
+        //      be rare.
+        startArray( "path"_sr );
+    }
+    void JsonReporter::sectionEnded( SectionStats const& /*sectionStats */) {
+        // End the subpath array
+        endArray();
+        // TODO: metadata
+        // TODO: what info do we have here?
+
+        // End the section object
+        endObject();
+    }
+
+    void JsonReporter::assertionStarting( AssertionInfo const& /*assertionInfo*/ ) {}
+    void JsonReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        // TODO: There is lot of different things to handle here, but
+        //       we can fill it in later, after we show that the basic
+        //       outline and streaming reporter impl works well enough.
+        //if ( !m_config->includeSuccessfulResults()
+        //    && assertionStats.assertionResult.isOk() ) {
+        //    return;
+        //}
+        assert( isInside( Writer::Array ) );
+        auto assertionObject = m_arrayWriters.top().writeObject();
+
+        assertionObject.write( "kind"_sr ).write( "assertion"_sr );
+        writeSourceInfo( assertionObject,
+                         assertionStats.assertionResult.getSourceInfo() );
+        assertionObject.write( "status"_sr )
+            .write( assertionStats.assertionResult.isOk() );
+        // TODO: handling of result.
+        // TODO: messages
+        // TODO: totals?
+    }
+
+
+    void JsonReporter::benchmarkPreparing( StringRef name ) { (void)name; }
+    void JsonReporter::benchmarkStarting( BenchmarkInfo const& ) {}
+    void JsonReporter::benchmarkEnded( BenchmarkStats<> const& ) {}
+    void JsonReporter::benchmarkFailed( StringRef error ) { (void)error; }
+
+    void JsonReporter::listReporters(
+        std::vector<ReporterDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "reporters"_sr ).writeArray();
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listListeners(
+        std::vector<ListenerDescription> const& descriptions ) {
+        startListing();
+
+        auto writer =
+            m_objectWriters.top().write( "listeners"_sr ).writeArray();
+
+        for ( auto const& desc : descriptions ) {
+            auto desc_writer = writer.writeObject();
+            desc_writer.write( "name"_sr ).write( desc.name );
+            desc_writer.write( "description"_sr ).write( desc.description );
+        }
+    }
+    void JsonReporter::listTests( std::vector<TestCaseHandle> const& tests ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tests"_sr ).writeArray();
+
+        for ( auto const& test : tests ) {
+            auto desc_writer = writer.writeObject();
+            auto const& info = test.getTestCaseInfo();
+
+            desc_writer.write( "name"_sr ).write( info.name );
+            desc_writer.write( "class-name"_sr ).write( info.className );
+            {
+                auto tag_writer = desc_writer.write( "tags"_sr ).writeArray();
+                for ( auto const& tag : info.tags ) {
+                    tag_writer.write( tag.original );
+                }
+            }
+            writeSourceInfo( desc_writer, info.lineInfo );
+        }
+    }
+    void JsonReporter::listTags( std::vector<TagInfo> const& tags ) {
+        startListing();
+
+        auto writer = m_objectWriters.top().write( "tags"_sr ).writeArray();
+        for ( auto const& tag : tags ) {
+            auto tag_writer = writer.writeObject();
+            {
+                auto aliases_writer =
+                    tag_writer.write( "aliases"_sr ).writeArray();
+                for ( auto alias : tag.spellings ) {
+                    aliases_writer.write( alias );
+                }
+            }
+            tag_writer.write( "count"_sr ).write( tag.count );
+        }
+    }
+} // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.hpp
new file mode 100644
index 00000000..c938ca39
--- /dev/null
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_json.hpp
@@ -0,0 +1,95 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_REPORTER_JSON_HPP_INCLUDED
+#define CATCH_REPORTER_JSON_HPP_INCLUDED
+
+#include <catch2/catch_timer.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+#include <catch2/reporters/catch_reporter_streaming_base.hpp>
+
+#include <stack>
+
+namespace Catch {
+    class JsonReporter : public StreamingReporterBase {
+    public:
+        JsonReporter( ReporterConfig&& config );
+
+        ~JsonReporter() override;
+
+        static std::string getDescription();
+
+    public: // StreamingReporterBase
+        void testRunStarting( TestRunInfo const& runInfo ) override;
+        void testRunEnded( TestRunStats const& runStats ) override;
+
+        void testCaseStarting( TestCaseInfo const& tcInfo ) override;
+        void testCaseEnded( TestCaseStats const& tcStats ) override;
+
+        void testCasePartialStarting( TestCaseInfo const& tcInfo,
+                                      uint64_t index ) override;
+        void testCasePartialEnded( TestCaseStats const& tcStats,
+                                   uint64_t index ) override;
+
+        void sectionStarting( SectionInfo const& sectionInfo ) override;
+        void sectionEnded( SectionStats const& sectionStats ) override;
+
+        void assertionStarting( AssertionInfo const& assertionInfo ) override;
+        void assertionEnded( AssertionStats const& assertionStats ) override;
+
+        //void testRunEndedCumulative() override;
+
+        void benchmarkPreparing( StringRef name ) override;
+        void benchmarkStarting( BenchmarkInfo const& ) override;
+        void benchmarkEnded( BenchmarkStats<> const& ) override;
+        void benchmarkFailed( StringRef error ) override;
+
+        void listReporters(
+            std::vector<ReporterDescription> const& descriptions ) override;
+        void listListeners(
+            std::vector<ListenerDescription> const& descriptions ) override;
+        void listTests( std::vector<TestCaseHandle> const& tests ) override;
+        void listTags( std::vector<TagInfo> const& tags ) override;
+
+    private:
+        Timer m_testCaseTimer;
+        enum class Writer {
+            Object,
+            Array
+        };
+
+        JsonArrayWriter& startArray();
+        JsonArrayWriter& startArray( StringRef key );
+
+        JsonObjectWriter& startObject();
+        JsonObjectWriter& startObject( StringRef key );
+
+        void endObject();
+        void endArray();
+
+        bool isInside( Writer writer );
+
+        void startListing();
+        void endListing();
+
+        // Invariant:
+        // When m_writers is not empty and its top element is
+        // - Writer::Object, then m_objectWriters is not be empty
+        // - Writer::Array,  then m_arrayWriters shall not be empty
+        std::stack<JsonObjectWriter> m_objectWriters{};
+        std::stack<JsonArrayWriter> m_arrayWriters{};
+        std::stack<Writer> m_writers{};
+
+        bool m_startedListing = false;
+
+        // std::size_t m_sectionDepth = 0;
+        // std::size_t m_sectionStarted = 0;
+    };
+} // namespace Catch
+
+#endif // CATCH_REPORTER_JSON_HPP_INCLUDED
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.cpp
index 22d6526f..fc5cae34 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.cpp
@@ -33,6 +33,8 @@ namespace Catch {
             gmtime_s(&timeInfo, &rawtime);
 #elif defined (CATCH_PLATFORM_PLAYSTATION)
             gmtime_s(&rawtime, &timeInfo);
+#elif defined (__IAR_SYSTEMS_ICC__)
+            timeInfo = *std::gmtime(&rawtime);
 #else
             gmtime_r(&rawtime, &timeInfo);
 #endif
@@ -293,7 +295,7 @@ namespace Catch {
                 }
             }
 
-            if( !result.getMessage().empty() )
+            if( result.hasMessage() )
                 rss << result.getMessage() << '\n';
             for( auto const& msg : stats.infoMessages )
                 if( msg.type == ResultWas::Info )
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.hpp
index 87c7c567..7cb53c25 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_junit.hpp
@@ -19,8 +19,6 @@ namespace Catch {
     public:
         JunitReporter(ReporterConfig&& _config);
 
-        ~JunitReporter() override = default;
-
         static std::string getDescription();
 
         void testRunStarting(TestRunInfo const& runInfo) override;
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_multi.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_multi.cpp
index ebf28b64..531902be 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_multi.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_multi.cpp
@@ -114,7 +114,6 @@ namespace Catch {
         }
     }
 
-    // The return value indicates if the messages buffer should be cleared:
     void MultiReporter::assertionEnded( AssertionStats const& assertionStats ) {
         const bool reportByDefault =
             assertionStats.assertionResult.getResultType() != ResultWas::Ok ||
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.cpp
index a9787ce5..2a3ac957 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.cpp
@@ -8,6 +8,7 @@
 
 #include <catch2/reporters/catch_reporter_registrars.hpp>
 
+#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 
 namespace Catch {
@@ -26,5 +27,10 @@ namespace Catch {
             }
         }
 
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory ) {
+            getMutableRegistryHub().registerListener( CATCH_MOVE(listenerFactory) );
+        }
+
+
     } // namespace Detail
 } // namespace Catch
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.hpp
index db5688f2..a93963f0 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_registrars.hpp
@@ -8,8 +8,6 @@
 #ifndef CATCH_REPORTER_REGISTRARS_HPP_INCLUDED
 #define CATCH_REPORTER_REGISTRARS_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_registry_hub.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
 #include <catch2/internal/catch_compiler_capabilities.hpp>
 #include <catch2/internal/catch_unique_name.hpp>
@@ -36,7 +34,8 @@ namespace Catch {
         //! independent on the reporter's concrete type
         void registerReporterImpl( std::string const& name,
                                    IReporterFactoryPtr reporterPtr );
-
+        //! Actually registers the factory, independent on listener's concrete type
+        void registerListenerImpl( Detail::unique_ptr<EventListenerFactory> listenerFactory );
     } // namespace Detail
 
     class IEventListener;
@@ -97,7 +96,7 @@ namespace Catch {
 
     public:
         ListenerRegistrar(StringRef listenerName) {
-            getMutableRegistryHub().registerListener( Detail::make_unique<TypedListenerFactory>(listenerName) );
+            registerListenerImpl( Detail::make_unique<TypedListenerFactory>(listenerName) );
         }
     };
 }
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp
index dd002b61..9c391b1f 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp
@@ -147,7 +147,7 @@ namespace Catch {
                 }
             }
 
-            if (!result.getMessage().empty())
+            if (result.hasMessage())
                 textRss << result.getMessage() << '\n';
 
             for (auto const& msg : stats.infoMessages)
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
index cad6deec..d26af62e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp
@@ -25,8 +25,6 @@ namespace Catch {
             m_shouldStoreSuccesfulAssertions = false;
         }
 
-        ~SonarQubeReporter() override = default;
-
         static std::string getDescription() {
             using namespace std::string_literals;
             return "Reports test results in the Generic Test Data SonarQube XML format"s;
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp
index 13672a28..5448000c 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp
@@ -8,7 +8,6 @@
 #ifndef CATCH_REPORTER_STREAMING_BASE_HPP_INCLUDED
 #define CATCH_REPORTER_STREAMING_BASE_HPP_INCLUDED
 
-#include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/reporters/catch_reporter_common_base.hpp>
 #include <catch2/internal/catch_move_and_forward.hpp>
 
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.cpp
index 563d6fb1..67d406fb 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.cpp
@@ -14,7 +14,6 @@
 #include <catch2/reporters/catch_reporter_helpers.hpp>
 
 #include <algorithm>
-#include <iterator>
 #include <ostream>
 
 namespace Catch {
@@ -165,7 +164,7 @@ namespace Catch {
 
                 // using messages.end() directly (or auto) yields compilation error:
                 std::vector<MessageInfo>::const_iterator itEnd = messages.end();
-                const std::size_t N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+                const std::size_t N = static_cast<std::size_t>(itEnd - itMessage);
 
                 stream << colourImpl->guardColour( colour ) << " with "
                        << pluralise( N, "message"_sr ) << ':';
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.hpp
index fe45df63..e6889bb1 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_tap.hpp
@@ -19,7 +19,6 @@ namespace Catch {
             StreamingReporterBase( CATCH_MOVE(config) ) {
             m_preferences.shouldReportAllAssertions = true;
         }
-        ~TAPReporter() override = default;
 
         static std::string getDescription() {
             using namespace std::string_literals;
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_teamcity.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
index 32072800..38aa55a6 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_teamcity.cpp
@@ -45,7 +45,7 @@ namespace Catch {
     } // end anonymous namespace
 
 
-    TeamCityReporter::~TeamCityReporter() {}
+    TeamCityReporter::~TeamCityReporter() = default;
 
     void TeamCityReporter::testRunStarting( TestRunInfo const& runInfo ) {
         m_stream << "##teamcity[testSuiteStarted name='" << escape( runInfo.name )
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_xml.cpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_xml.cpp
index 13812b92..35a3028e 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_xml.cpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporter_xml.cpp
@@ -56,7 +56,7 @@ namespace Catch {
         m_xml.startElement("Catch2TestRun")
              .writeAttribute("name"_sr, m_config->name())
              .writeAttribute("rng-seed"_sr, m_config->rngSeed())
-             .writeAttribute("xml-format-version"_sr, 2)
+             .writeAttribute("xml-format-version"_sr, 3)
              .writeAttribute("catch2-version"_sr, libraryVersion());
         if ( m_config->testSpec().hasFilters() ) {
             m_xml.writeAttribute( "filters"_sr, m_config->testSpec() );
@@ -98,11 +98,13 @@ namespace Catch {
             // Print any info messages in <Info> tags.
             for( auto const& msg : assertionStats.infoMessages ) {
                 if( msg.type == ResultWas::Info && includeResults ) {
-                    m_xml.scopedElement( "Info" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Info" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 } else if ( msg.type == ResultWas::Warning ) {
-                    m_xml.scopedElement( "Warning" )
-                            .writeText( msg.message );
+                    auto t = m_xml.scopedElement( "Warning" );
+                    writeSourceInfo( msg.lineInfo );
+                    t.writeText( msg.message );
                 }
             }
         }
@@ -232,26 +234,23 @@ namespace Catch {
     }
 
     void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {
-        m_xml.startElement("mean")
+        m_xml.scopedElement("mean")
             .writeAttribute("value"_sr, benchmarkStats.mean.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.mean.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.mean.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.mean.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("standardDeviation")
+        m_xml.scopedElement("standardDeviation")
             .writeAttribute("value"_sr, benchmarkStats.standardDeviation.point.count())
             .writeAttribute("lowerBound"_sr, benchmarkStats.standardDeviation.lower_bound.count())
             .writeAttribute("upperBound"_sr, benchmarkStats.standardDeviation.upper_bound.count())
             .writeAttribute("ci"_sr, benchmarkStats.standardDeviation.confidence_interval);
-        m_xml.endElement();
-        m_xml.startElement("outliers")
+        m_xml.scopedElement("outliers")
             .writeAttribute("variance"_sr, benchmarkStats.outlierVariance)
             .writeAttribute("lowMild"_sr, benchmarkStats.outliers.low_mild)
             .writeAttribute("lowSevere"_sr, benchmarkStats.outliers.low_severe)
             .writeAttribute("highMild"_sr, benchmarkStats.outliers.high_mild)
             .writeAttribute("highSevere"_sr, benchmarkStats.outliers.high_severe);
         m_xml.endElement();
-        m_xml.endElement();
     }
 
     void XmlReporter::benchmarkFailed(StringRef error) {
diff --git a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporters_all.hpp b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporters_all.hpp
index 16f7bd70..5c713fe1 100644
--- a/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporters_all.hpp
+++ b/alpaka/thirdParty/catch2/src/catch2/reporters/catch_reporters_all.hpp
@@ -28,6 +28,7 @@
 #include <catch2/reporters/catch_reporter_cumulative_base.hpp>
 #include <catch2/reporters/catch_reporter_event_listener.hpp>
 #include <catch2/reporters/catch_reporter_helpers.hpp>
+#include <catch2/reporters/catch_reporter_json.hpp>
 #include <catch2/reporters/catch_reporter_junit.hpp>
 #include <catch2/reporters/catch_reporter_multi.hpp>
 #include <catch2/reporters/catch_reporter_registrars.hpp>
diff --git a/alpaka/thirdParty/catch2/tests/CMakeLists.txt b/alpaka/thirdParty/catch2/tests/CMakeLists.txt
index 7be57abe..d3ab14a7 100644
--- a/alpaka/thirdParty/catch2/tests/CMakeLists.txt
+++ b/alpaka/thirdParty/catch2/tests/CMakeLists.txt
@@ -78,6 +78,7 @@ endif(MSVC) #Temporary workaround
 set(TEST_SOURCES
         ${SELF_TEST_DIR}/TestRegistrations.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Algorithms.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/AssertionHandler.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Clara.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/CmdLine.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/CmdLineHelpers.tests.cpp
@@ -85,7 +86,9 @@ set(TEST_SOURCES
         ${SELF_TEST_DIR}/IntrospectiveTests/Details.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/FloatingPoint.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/GeneratorsImpl.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/Integer.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/InternalBenchmark.tests.cpp
+        ${SELF_TEST_DIR}/IntrospectiveTests/Json.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/Parse.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/PartTracker.tests.cpp
         ${SELF_TEST_DIR}/IntrospectiveTests/RandomNumberGeneration.tests.cpp
@@ -622,6 +625,18 @@ if (CATCH_ENABLE_CONFIGURE_TESTS)
     endforeach()
 endif()
 
+if (CATCH_ENABLE_CMAKE_HELPER_TESTS)
+    add_test(NAME "CMakeHelper::DiscoverTests"
+      COMMAND
+        "${PYTHON_EXECUTABLE}" "${CMAKE_CURRENT_LIST_DIR}/TestScripts/DiscoverTests/VerifyRegistration.py" "${CATCH_DIR}" "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+    set_tests_properties("CMakeHelper::DiscoverTests"
+      PROPERTIES
+        COST 240
+        LABELS "uses-python"
+    )
+endif()
+
 foreach (reporterName # "Automake" - the simple .trs format does not support any kind of comments/metadata
                       "compact"
                       "console"
@@ -629,7 +644,8 @@ foreach (reporterName # "Automake" - the simple .trs format does not support any
                       "SonarQube"
                       "TAP"
                       # "TeamCity" - does not seem to support test suite-level metadata/comments
-                      "XML")
+                      "XML"
+                      "JSON")
 
     add_test(NAME "Reporters:Filters:${reporterName}"
       COMMAND
@@ -639,6 +655,8 @@ foreach (reporterName # "Automake" - the simple .trs format does not support any
     # Different regex for these two reporters, because the commas end up xml-escaped
     if (reporterName MATCHES "JUnit|XML")
       set(testCaseNameFormat "&quot;CaseInsensitiveLess is case insensitive&quot;")
+    elseif(reporterName MATCHES "JSON")
+      set(testCaseNameFormat "\\\\\"CaseInsensitiveLess is case insensitive\\\\\"")
     else()
       set(testCaseNameFormat "\"CaseInsensitiveLess is case insensitive\"")
     endif()
diff --git a/alpaka/thirdParty/catch2/tests/ExtraTests/CMakeLists.txt b/alpaka/thirdParty/catch2/tests/ExtraTests/CMakeLists.txt
index 4172d7a0..2a810e25 100644
--- a/alpaka/thirdParty/catch2/tests/ExtraTests/CMakeLists.txt
+++ b/alpaka/thirdParty/catch2/tests/ExtraTests/CMakeLists.txt
@@ -468,6 +468,17 @@ set_tests_properties(
 )
 
 
+add_executable(AssertionStartingEventGoesBeforeAssertionIsEvaluated
+  X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
+)
+target_link_libraries(AssertionStartingEventGoesBeforeAssertionIsEvaluated
+  PRIVATE Catch2::Catch2WithMain
+)
+add_test(
+  NAME ReporterEvents::AssertionStartingHappensBeforeAssertionIsEvaluated
+  COMMAND $<TARGET_FILE:AssertionStartingEventGoesBeforeAssertionIsEvaluated>
+)
+
 #add_executable(DebugBreakMacros ${TESTS_DIR}/X12-CustomDebugBreakMacro.cpp)
 #target_link_libraries(DebugBreakMacros Catch2)
 #add_test(NAME DebugBreakMacros COMMAND DebugBreakMacros --break)
diff --git a/alpaka/thirdParty/catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp b/alpaka/thirdParty/catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
new file mode 100644
index 00000000..6f44bf69
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/ExtraTests/X20-AssertionStartingEventGoesBeforeAssertionIsEvaluated.cpp
@@ -0,0 +1,77 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+/**\file
+ * Registers an event listener to increments counter of assertionStarting events.
+ *
+ * Different assertion macros then check that the counter is at expected
+ * value when they are evaluated.
+ */
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/reporters/catch_reporter_event_listener.hpp>
+#include <catch2/reporters/catch_reporter_registrars.hpp>
+#include <catch2/matchers/catch_matchers_predicate.hpp>
+
+namespace {
+
+    static size_t assertion_starting_events_seen = 0;
+
+    class AssertionStartingListener : public Catch::EventListenerBase {
+    public:
+        AssertionStartingListener( Catch::IConfig const* config ):
+            EventListenerBase( config ) {}
+
+        void assertionStarting( Catch::AssertionInfo const& ) override {
+            ++assertion_starting_events_seen;
+        }
+    };
+
+    static bool f1() {
+        return assertion_starting_events_seen == 1;
+    }
+
+    static void f2() {
+        if ( assertion_starting_events_seen != 2 ) { throw 1; }
+    }
+
+    static void f3() {
+        if ( assertion_starting_events_seen == 3 ) { throw 1; }
+    }
+
+    static bool f4() { return assertion_starting_events_seen == 4; }
+
+    static void f5() { throw assertion_starting_events_seen; }
+
+} // anonymous namespace
+
+CATCH_REGISTER_LISTENER( AssertionStartingListener )
+
+TEST_CASE() {
+    // **IMPORTANT**
+    // The order of assertions below matters.
+    REQUIRE( f1() );
+    REQUIRE_NOTHROW( f2() );
+    REQUIRE_THROWS( f3() );
+    REQUIRE_THAT( f4(),
+                  Catch::Matchers::Predicate<bool>( []( bool b ) { return b; } ) );
+    REQUIRE_THROWS_MATCHES(
+        f5(), size_t, Catch::Matchers::Predicate<size_t>( []( size_t i ) {
+            return i == 5;
+        } ) );
+
+    CAPTURE( assertion_starting_events_seen ); // **not** an assertion
+    INFO( "some info msg" );                   // **not** an assertion
+    WARN( "warning! warning!" );               // assertion-like message
+    SUCCEED();                                 // assertion-like message
+
+    // We skip FAIL/SKIP and so on, which fail the test.
+
+    // This require will also increment the count once
+    REQUIRE( assertion_starting_events_seen == 8 );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
index d33effdd..88c23e17 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.approved.txt
@@ -130,8 +130,8 @@ Nor would this
 :test-result: FAIL Custom std-exceptions can be custom translated
 :test-result: PASS Default scale is invisible to comparison
 :test-result: PASS Directly creating an EnumInfo
+:test-result: SKIP Empty generators can SKIP in constructor
 :test-result: PASS Empty stream name opens cout stream
-:test-result: PASS Empty tag is not allowed
 :test-result: FAIL EndsWith string matcher
 :test-result: PASS Enums can quickly have stringification enabled using REGISTER_ENUM
 :test-result: PASS Enums in namespaces can quickly have stringification enabled using REGISTER_ENUM
@@ -154,6 +154,7 @@ Nor would this
 :test-result: PASS Filter generator throws exception for empty generator
 :test-result: PASS Floating point matchers: double
 :test-result: PASS Floating point matchers: float
+:test-result: PASS GENERATE can combine literals and generators
 :test-result: PASS Generators -- adapters
 :test-result: PASS Generators -- simple
 :test-result: PASS Generators internals
@@ -162,12 +163,16 @@ Nor would this
 :test-result: PASS Hashers with same seed produce same hash
 :test-result: PASS Hashing different test cases produces different result
 :test-result: PASS Hashing test case produces same hash across multiple calls
+:test-result: FAIL INFO and UNSCOPED_INFO can stream multiple arguments
 :test-result: FAIL INFO and WARN do not abort tests
 :test-result: FAIL INFO gets logged on failure
 :test-result: FAIL INFO gets logged on failure, even if captured before successful assertions
 :test-result: FAIL INFO is reset for each loop
+:test-result: XFAIL Incomplete AssertionHandler
 :test-result: XFAIL Inequality checks that should fail
 :test-result: PASS Inequality checks that should succeed
+:test-result: PASS JsonWriter
+:test-result: PASS JsonWriter escapes charaters in strings properly
 :test-result: PASS Lambdas in assertions
 :test-result: PASS Less-than inequalities with different epsilons
 :test-result: PASS ManuallyRegistered
@@ -265,6 +270,8 @@ Message from section two
 :test-result: PASS Testing checked-if
 :test-result: XFAIL Testing checked-if 2
 :test-result: XFAIL Testing checked-if 3
+:test-result: XFAIL Testing checked-if 4
+:test-result: XFAIL Testing checked-if 5
 :test-result: FAIL The NO_FAIL macro reports a failure but does not fail the test
 :test-result: PASS The default listing implementation write to provided stream
 :test-result: FAIL This test 'should' fail but doesn't
@@ -408,6 +415,7 @@ b1!
 :test-result: PASS tuple<string,string>
 :test-result: PASS tuple<tuple<int>,tuple<>,float>
 :test-result: PASS uniform samples
+:test-result: PASS uniform_integer_distribution can return the bounds
 :test-result: PASS unique_ptr reimplementation: basic functionality
 :test-result: PASS vec<vec<string,alloc>> -> toString
 :test-result: PASS vector<bool> -> toString
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
index f698f0c5..a37b1a2b 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/automake.sw.multi.approved.txt
@@ -128,8 +128,8 @@
 :test-result: FAIL Custom std-exceptions can be custom translated
 :test-result: PASS Default scale is invisible to comparison
 :test-result: PASS Directly creating an EnumInfo
+:test-result: SKIP Empty generators can SKIP in constructor
 :test-result: PASS Empty stream name opens cout stream
-:test-result: PASS Empty tag is not allowed
 :test-result: FAIL EndsWith string matcher
 :test-result: PASS Enums can quickly have stringification enabled using REGISTER_ENUM
 :test-result: PASS Enums in namespaces can quickly have stringification enabled using REGISTER_ENUM
@@ -152,6 +152,7 @@
 :test-result: PASS Filter generator throws exception for empty generator
 :test-result: PASS Floating point matchers: double
 :test-result: PASS Floating point matchers: float
+:test-result: PASS GENERATE can combine literals and generators
 :test-result: PASS Generators -- adapters
 :test-result: PASS Generators -- simple
 :test-result: PASS Generators internals
@@ -160,12 +161,16 @@
 :test-result: PASS Hashers with same seed produce same hash
 :test-result: PASS Hashing different test cases produces different result
 :test-result: PASS Hashing test case produces same hash across multiple calls
+:test-result: FAIL INFO and UNSCOPED_INFO can stream multiple arguments
 :test-result: FAIL INFO and WARN do not abort tests
 :test-result: FAIL INFO gets logged on failure
 :test-result: FAIL INFO gets logged on failure, even if captured before successful assertions
 :test-result: FAIL INFO is reset for each loop
+:test-result: XFAIL Incomplete AssertionHandler
 :test-result: XFAIL Inequality checks that should fail
 :test-result: PASS Inequality checks that should succeed
+:test-result: PASS JsonWriter
+:test-result: PASS JsonWriter escapes charaters in strings properly
 :test-result: PASS Lambdas in assertions
 :test-result: PASS Less-than inequalities with different epsilons
 :test-result: PASS ManuallyRegistered
@@ -258,6 +263,8 @@
 :test-result: PASS Testing checked-if
 :test-result: XFAIL Testing checked-if 2
 :test-result: XFAIL Testing checked-if 3
+:test-result: XFAIL Testing checked-if 4
+:test-result: XFAIL Testing checked-if 5
 :test-result: FAIL The NO_FAIL macro reports a failure but does not fail the test
 :test-result: PASS The default listing implementation write to provided stream
 :test-result: FAIL This test 'should' fail but doesn't
@@ -397,6 +404,7 @@
 :test-result: PASS tuple<string,string>
 :test-result: PASS tuple<tuple<int>,tuple<>,float>
 :test-result: PASS uniform samples
+:test-result: PASS uniform_integer_distribution can return the bounds
 :test-result: PASS unique_ptr reimplementation: basic functionality
 :test-result: PASS vec<vec<string,alloc>> -> toString
 :test-result: PASS vector<bool> -> toString
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
index 541770cc..0669fdbb 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.approved.txt
@@ -331,7 +331,7 @@ MatchersRanges.tests.cpp:<line number>: passed: inner_lists_are_empty.front(), I
 MatchersRanges.tests.cpp:<line number>: passed: has_empty{}, !IsEmpty() for: {?} not is empty
 MatchersRanges.tests.cpp:<line number>: passed: unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 Message.tests.cpp:<line number>: passed: with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
-Message.tests.cpp:<line number>: passed: with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+Message.tests.cpp:<line number>: passed: with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 Message.tests.cpp:<line number>: passed: with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: 'i := 2'
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: '3'
@@ -520,8 +520,8 @@ ToString.tests.cpp:<line number>: passed: enumInfo->lookup(1) == "Value2" for: V
 ToString.tests.cpp:<line number>: passed: enumInfo->lookup(3) == "{** unexpected enum value **}" for: {** unexpected enum value **}
 ==
 "{** unexpected enum value **}"
+Skip.tests.cpp:<line number>: skipped: 'This generator is empty'
 Stream.tests.cpp:<line number>: passed: Catch::makeStream( "" )->isConsole() for: true
-Tag.tests.cpp:<line number>: passed: Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
 Matchers.tests.cpp:<line number>: failed: testStringForMatching(), EndsWith( "Substring" ) for: "this string contains 'abc' as a substring" ends with: "Substring"
 Matchers.tests.cpp:<line number>: failed: testStringForMatching(), EndsWith( "this", Catch::CaseSensitive::No ) for: "this string contains 'abc' as a substring" ends with: "this" (case insensitive)
 EnumToString.tests.cpp:<line number>: passed: stringify( EnumClass3::Value1 ) == "Value1" for: "Value1" == "Value1"
@@ -666,6 +666,10 @@ Matchers.tests.cpp:<line number>: passed: 1., !IsNaN() for: 1.0 not is NaN
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: filter([] (int) {return false; }, value(1)), Catch::GeneratorException
 Generators.tests.cpp:<line number>: passed: i < 4 for: 1 < 4
 Generators.tests.cpp:<line number>: passed: i < 4 for: 2 < 4
@@ -944,6 +948,7 @@ TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy1 ) != h( dummy2 ) f
 TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>)
 ==
 3422778688 (0x<hex digits>)
+Message.tests.cpp:<line number>: failed: explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 Message.tests.cpp:<line number>: warning: 'this is a message' with 1 message: 'this is a warning'
 Message.tests.cpp:<line number>: failed: a == 1 for: 2 == 1 with 2 messages: 'this message should be logged' and 'so should this'
 Message.tests.cpp:<line number>: passed: a == 2 for: 2 == 2 with 1 message: 'this message may be logged later'
@@ -961,6 +966,7 @@ Message.tests.cpp:<line number>: passed: i < 10 for: 7 < 10 with 2 messages: 'cu
 Message.tests.cpp:<line number>: passed: i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and 'i := 8'
 Message.tests.cpp:<line number>: passed: i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 Message.tests.cpp:<line number>: failed: i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+AssertionHandler.tests.cpp:<line number>: failed: unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 Condition.tests.cpp:<line number>: failed: data.int_seven != 7 for: 7 != 7
 Condition.tests.cpp:<line number>: failed: data.float_nine_point_one != Approx( 9.1f ) for: 9.1f != Approx( 9.1000003815 )
 Condition.tests.cpp:<line number>: failed: data.double_pi != Approx( 3.1415926535 ) for: 3.1415926535 != Approx( 3.1415926535 )
@@ -977,6 +983,91 @@ Condition.tests.cpp:<line number>: passed: data.str_hello != "goodbye" for: "hel
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hell" for: "hello" != "hell"
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hello1" for: "hello" != "hello1"
 Condition.tests.cpp:<line number>: passed: data.str_hello.size() != 6 for: 5 != 6
+Json.tests.cpp:<line number>: passed: stream.str() == "" for: "" == ""
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"/\"" for: ""/"" == ""/""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 Compilation.tests.cpp:<line number>: passed: []() { return true; }() for: true
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.24 ) for: 1.23 <= Approx( 1.24 )
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.23 ) for: 1.23 <= Approx( 1.23 )
@@ -1341,6 +1432,60 @@ Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fa
 
 " ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: console'
 Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fake reporter"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?>
 All available tags:
    1  [fakeTag]
@@ -1750,6 +1895,10 @@ Misc.tests.cpp:<line number>: passed: true
 Misc.tests.cpp:<line number>: failed: explicitly
 Misc.tests.cpp:<line number>: failed - but was ok: false
 Misc.tests.cpp:<line number>: failed: explicitly
+Misc.tests.cpp:<line number>: passed: true
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+Misc.tests.cpp:<line number>: failed - but was ok: false
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 Message.tests.cpp:<line number>: failed - but was ok: 1 == 2
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("[fakeTag]"s) for: "All available tags:
    1  [fakeTag]
@@ -2473,6 +2622,8 @@ InternalBenchmark.tests.cpp:<line number>: passed: e.point == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.upper_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.lower_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.confidence_interval == 0.95 for: 0.95 == 0.95
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.a() == -10 for: -10 == -10
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.b() == 10 for: 10 == 10
 UniquePtr.tests.cpp:<line number>: passed: !(ptr) for: !{?}
 UniquePtr.tests.cpp:<line number>: passed: ptr.get() == 0 for: 0 == 0
 UniquePtr.tests.cpp:<line number>: passed: ptr for: {?}
@@ -2538,7 +2689,7 @@ InternalBenchmark.tests.cpp:<line number>: passed: med == 18. for: 18.0 == 18.0
 InternalBenchmark.tests.cpp:<line number>: passed: q3 == 23. for: 23.0 == 23.0
 Misc.tests.cpp:<line number>: passed:
 Misc.tests.cpp:<line number>: passed:
-test cases:  409 |  309 passed |  84 failed | 5 skipped | 11 failed as expected
-assertions: 2226 | 2049 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
index 5b292da1..214fef74 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/compact.sw.multi.approved.txt
@@ -329,7 +329,7 @@ MatchersRanges.tests.cpp:<line number>: passed: inner_lists_are_empty.front(), I
 MatchersRanges.tests.cpp:<line number>: passed: has_empty{}, !IsEmpty() for: {?} not is empty
 MatchersRanges.tests.cpp:<line number>: passed: unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 Message.tests.cpp:<line number>: passed: with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
-Message.tests.cpp:<line number>: passed: with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+Message.tests.cpp:<line number>: passed: with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 Message.tests.cpp:<line number>: passed: with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: 'i := 2'
 ToStringGeneral.tests.cpp:<line number>: passed: true with 1 message: '3'
@@ -518,8 +518,8 @@ ToString.tests.cpp:<line number>: passed: enumInfo->lookup(1) == "Value2" for: V
 ToString.tests.cpp:<line number>: passed: enumInfo->lookup(3) == "{** unexpected enum value **}" for: {** unexpected enum value **}
 ==
 "{** unexpected enum value **}"
+Skip.tests.cpp:<line number>: skipped: 'This generator is empty'
 Stream.tests.cpp:<line number>: passed: Catch::makeStream( "" )->isConsole() for: true
-Tag.tests.cpp:<line number>: passed: Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
 Matchers.tests.cpp:<line number>: failed: testStringForMatching(), EndsWith( "Substring" ) for: "this string contains 'abc' as a substring" ends with: "Substring"
 Matchers.tests.cpp:<line number>: failed: testStringForMatching(), EndsWith( "this", Catch::CaseSensitive::No ) for: "this string contains 'abc' as a substring" ends with: "this" (case insensitive)
 EnumToString.tests.cpp:<line number>: passed: stringify( EnumClass3::Value1 ) == "Value1" for: "Value1" == "Value1"
@@ -664,6 +664,10 @@ Matchers.tests.cpp:<line number>: passed: 1., !IsNaN() for: 1.0 not is NaN
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
+Generators.tests.cpp:<line number>: passed: i % 2 == 0 for: 0 == 0
 Generators.tests.cpp:<line number>: passed: filter([] (int) {return false; }, value(1)), Catch::GeneratorException
 Generators.tests.cpp:<line number>: passed: i < 4 for: 1 < 4
 Generators.tests.cpp:<line number>: passed: i < 4 for: 2 < 4
@@ -942,6 +946,7 @@ TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy1 ) != h( dummy2 ) f
 TestCaseInfoHasher.tests.cpp:<line number>: passed: h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>)
 ==
 3422778688 (0x<hex digits>)
+Message.tests.cpp:<line number>: failed: explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 Message.tests.cpp:<line number>: warning: 'this is a message' with 1 message: 'this is a warning'
 Message.tests.cpp:<line number>: failed: a == 1 for: 2 == 1 with 2 messages: 'this message should be logged' and 'so should this'
 Message.tests.cpp:<line number>: passed: a == 2 for: 2 == 2 with 1 message: 'this message may be logged later'
@@ -959,6 +964,7 @@ Message.tests.cpp:<line number>: passed: i < 10 for: 7 < 10 with 2 messages: 'cu
 Message.tests.cpp:<line number>: passed: i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and 'i := 8'
 Message.tests.cpp:<line number>: passed: i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 Message.tests.cpp:<line number>: failed: i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+AssertionHandler.tests.cpp:<line number>: failed: unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 Condition.tests.cpp:<line number>: failed: data.int_seven != 7 for: 7 != 7
 Condition.tests.cpp:<line number>: failed: data.float_nine_point_one != Approx( 9.1f ) for: 9.1f != Approx( 9.1000003815 )
 Condition.tests.cpp:<line number>: failed: data.double_pi != Approx( 3.1415926535 ) for: 3.1415926535 != Approx( 3.1415926535 )
@@ -975,6 +981,91 @@ Condition.tests.cpp:<line number>: passed: data.str_hello != "goodbye" for: "hel
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hell" for: "hello" != "hell"
 Condition.tests.cpp:<line number>: passed: data.str_hello != "hello1" for: "hello" != "hello1"
 Condition.tests.cpp:<line number>: passed: data.str_hello.size() != 6 for: 5 != 6
+Json.tests.cpp:<line number>: passed: stream.str() == "" for: "" == ""
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+Json.tests.cpp:<line number>: passed: stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "{\n}" for: "{
+}"
+==
+"{
+}"
+Json.tests.cpp:<line number>: passed: stream.str() == "[\n]" for: "[
+]"
+==
+"[
+]"
+Json.tests.cpp:<line number>: passed: stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"/\"" for: ""/"" == ""/""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+Json.tests.cpp:<line number>: passed: sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 Compilation.tests.cpp:<line number>: passed: []() { return true; }() for: true
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.24 ) for: 1.23 <= Approx( 1.24 )
 Approx.tests.cpp:<line number>: passed: d <= Approx( 1.23 ) for: 1.23 <= Approx( 1.23 )
@@ -1339,6 +1430,60 @@ Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fa
 
 " ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: console'
 Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fake reporter"s) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
+Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+Reporters.tests.cpp:<line number>: passed: !(factories.empty()) for: !false
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?>
 All available tags:
    1  [fakeTag]
@@ -1743,6 +1888,10 @@ Misc.tests.cpp:<line number>: passed: true
 Misc.tests.cpp:<line number>: failed: explicitly
 Misc.tests.cpp:<line number>: failed - but was ok: false
 Misc.tests.cpp:<line number>: failed: explicitly
+Misc.tests.cpp:<line number>: passed: true
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+Misc.tests.cpp:<line number>: failed - but was ok: false
+Misc.tests.cpp:<line number>: failed: unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 Message.tests.cpp:<line number>: failed - but was ok: 1 == 2
 Reporters.tests.cpp:<line number>: passed: listingString, ContainsSubstring("[fakeTag]"s) for: "All available tags:
    1  [fakeTag]
@@ -2462,6 +2611,8 @@ InternalBenchmark.tests.cpp:<line number>: passed: e.point == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.upper_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.lower_bound == 23 for: 23.0 == 23
 InternalBenchmark.tests.cpp:<line number>: passed: e.confidence_interval == 0.95 for: 0.95 == 0.95
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.a() == -10 for: -10 == -10
+RandomNumberGeneration.tests.cpp:<line number>: passed: dist.b() == 10 for: 10 == 10
 UniquePtr.tests.cpp:<line number>: passed: !(ptr) for: !{?}
 UniquePtr.tests.cpp:<line number>: passed: ptr.get() == 0 for: 0 == 0
 UniquePtr.tests.cpp:<line number>: passed: ptr for: {?}
@@ -2527,7 +2678,7 @@ InternalBenchmark.tests.cpp:<line number>: passed: med == 18. for: 18.0 == 18.0
 InternalBenchmark.tests.cpp:<line number>: passed: q3 == 23. for: 23.0 == 23.0
 Misc.tests.cpp:<line number>: passed:
 Misc.tests.cpp:<line number>: passed:
-test cases:  409 |  309 passed |  84 failed | 5 skipped | 11 failed as expected
-assertions: 2226 | 2049 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.std.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.std.approved.txt
index 15d8b024..25426256 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.std.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.std.approved.txt
@@ -383,6 +383,16 @@ Exception.tests.cpp:<line number>: FAILED:
 due to unexpected exception with message:
   custom std exception
 
+-------------------------------------------------------------------------------
+Empty generators can SKIP in constructor
+-------------------------------------------------------------------------------
+Skip.tests.cpp:<line number>
+...............................................................................
+
+Skip.tests.cpp:<line number>: SKIPPED:
+explicitly with message:
+  This generator is empty
+
 -------------------------------------------------------------------------------
 EndsWith string matcher
 -------------------------------------------------------------------------------
@@ -589,6 +599,18 @@ explicitly with message:
 Message.tests.cpp:<line number>: warning:
   This message appears in the output
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -649,6 +671,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -987,6 +1020,28 @@ Misc.tests.cpp:<line number>
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 Thrown string literals are translated
 -------------------------------------------------------------------------------
@@ -1533,6 +1588,6 @@ due to unexpected exception with message:
   Why would you throw a std::string?
 
 ===============================================================================
-test cases:  409 |  323 passed |  69 failed | 6 skipped | 11 failed as expected
-assertions: 2209 | 2049 passed | 128 failed | 32 failed as expected
+test cases:  417 |  326 passed |  70 failed | 7 skipped | 14 failed as expected
+assertions: 2243 | 2079 passed | 129 failed | 35 failed as expected
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.approved.txt
index 26d5e8f3..077b7bf7 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.approved.txt
@@ -2740,9 +2740,9 @@ Message.tests.cpp:<line number>
 
 Message.tests.cpp:<line number>: PASSED:
 with messages:
-  std::vector<int>{1, 2, 3}[0, 1, 2] := 3
-  std::vector<int>{1, 2, 3}[(0, 1)] := 2
-  std::vector<int>{1, 2, 3}[0] := 1
+  custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0
+  custom_index_op<int>{1, 2, 3}[(0, 1)] := 0
+  custom_index_op<int>{1, 2, 3}[0] := 0
   (helper_1436<int, int>{12, -12}) := { 12, -12 }
   (helper_1436<int, int>(-12, 12)) := { -12, 12 }
   (1, 2) := 2
@@ -3956,6 +3956,16 @@ with expansion:
   ==
   "{** unexpected enum value **}"
 
+-------------------------------------------------------------------------------
+Empty generators can SKIP in constructor
+-------------------------------------------------------------------------------
+Skip.tests.cpp:<line number>
+...............................................................................
+
+Skip.tests.cpp:<line number>: SKIPPED:
+explicitly with message:
+  This generator is empty
+
 -------------------------------------------------------------------------------
 Empty stream name opens cout stream
 -------------------------------------------------------------------------------
@@ -3967,15 +3977,6 @@ Stream.tests.cpp:<line number>: PASSED:
 with expansion:
   true
 
--------------------------------------------------------------------------------
-Empty tag is not allowed
--------------------------------------------------------------------------------
-Tag.tests.cpp:<line number>
-...............................................................................
-
-Tag.tests.cpp:<line number>: PASSED:
-  REQUIRE_THROWS( Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo ) )
-
 -------------------------------------------------------------------------------
 EndsWith string matcher
 -------------------------------------------------------------------------------
@@ -4888,6 +4889,50 @@ Matchers.tests.cpp:<line number>: PASSED:
 with expansion:
   1.0 not is NaN
 
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
 -------------------------------------------------------------------------------
 Generators -- adapters
   Filtering by predicate
@@ -6981,6 +7026,18 @@ with expansion:
   ==
   3422778688 (0x<hex digits>)
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -7142,6 +7199,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -7234,6 +7302,291 @@ Condition.tests.cpp:<line number>: PASSED:
 with expansion:
   5 != 6
 
+-------------------------------------------------------------------------------
+JsonWriter
+  Newly constructed JsonWriter does nothing
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "" )
+with expansion:
+  "" == ""
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject with key will create an object to write the value
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) )
+with expansion:
+  "{
+    "int": 1,
+    "double": 1.5,
+    "true": true,
+    "false": false,
+    "string": "this is a string",
+    "array": [
+      1,
+      2
+    ]
+  }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains:
+  ""true": true," and contains: ""false": false," and contains: ""string":
+  "this is a string"," and contains: ""array": [
+      1,
+      2
+    ]
+  }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  nesting objects
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) )
+with expansion:
+  "{
+    "empty_object": {
+    },
+    "fully_object": {
+      "key": 1
+    }
+  }" ( contains: ""empty_object": {
+    }," and contains: ""fully_object": {
+      "key": 1
+    }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray creates array to write the values to
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" )
+with expansion:
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+  ==
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonObjectWriter shall not insert superfluous brace
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonArrayWriter shall not insert superfluous bracket
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Custom class shall be quoted
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "\"custom\"" )
+with expansion:
+  ""custom"" == ""custom""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Quote in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\"\"" )
+with expansion:
+  ""\""" == ""\"""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backslash in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\\"" )
+with expansion:
+  ""\\"" == ""\\""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Forward slash in a string is **not** escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"/\"" )
+with expansion:
+  ""/"" == ""/""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backspace in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\b\"" )
+with expansion:
+  ""\b"" == ""\b""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Formfeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\f\"" )
+with expansion:
+  ""\f"" == ""\f""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  linefeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\n\"" )
+with expansion:
+  ""\n"" == ""\n""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  carriage return in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\r\"" )
+with expansion:
+  ""\r"" == ""\r""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  tab in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\t\"" )
+with expansion:
+  ""\t"" == ""\t""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  combination of characters is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" )
+with expansion:
+  ""\\/\t\r\n"" == ""\\/\t\r\n""
+
 -------------------------------------------------------------------------------
 Lambdas in assertions
 -------------------------------------------------------------------------------
@@ -9732,6 +10085,129 @@ Reporter's write listings to provided stream
 Reporters.tests.cpp:<line number>
 ...............................................................................
 
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tags
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fakeTag"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tags": [
+        {
+          "aliases": [
+            "fakeTag"
+          ],
+          "count": 1
+        }
+      ]" contains: "fakeTag"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists reporters
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fake reporter"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "reporters": [
+        {
+          "name": "fake reporter",
+          "description": "fake description"
+        }
+      ]" contains: "fake reporter"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tests
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tests": [
+        {
+          "name": "fake test name",
+          "class-name": "",
+          "tags": [
+            "fakeTestTag"
+          ],
+          "source-location": {
+            "filename": "fake-file.cpp",
+            "line": 123456789
+          }
+        }
+      ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
 Reporters.tests.cpp:<line number>: PASSED:
   REQUIRE_FALSE( factories.empty() )
 with expansion:
@@ -12521,6 +12997,34 @@ Misc.tests.cpp:<line number>: FAILED - but was ok:
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: PASSED:
+  CHECKED_ELSE( true )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED - but was ok:
+  CHECKED_ELSE( false )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 The NO_FAIL macro reports a failure but does not fail the test
 -------------------------------------------------------------------------------
@@ -13460,7 +13964,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AllTrue range matcher
   Basic usage
-  One false evalutes to false
+  One false evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13499,7 +14003,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AllTrue range matcher
   Contained type is convertible to bool
-  One false evalutes to false
+  One false evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13735,7 +14239,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AnyTrue range matcher
   Basic usage
-  One true evalutes to true
+  One true evaluates to true
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13774,7 +14278,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AnyTrue range matcher
   Contained type is convertible to bool
-  One true evalutes to true
+  One true evaluates to true
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -14010,7 +14514,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of NoneTrue range matcher
   Basic usage
-  One true evalutes to false
+  One true evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -14049,7 +14553,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of NoneTrue range matcher
   Contained type is convertible to bool
-  One true evalutes to false
+  One true evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -17744,6 +18248,22 @@ InternalBenchmark.tests.cpp:<line number>: PASSED:
 with expansion:
   0.95 == 0.95
 
+-------------------------------------------------------------------------------
+uniform_integer_distribution can return the bounds
+-------------------------------------------------------------------------------
+RandomNumberGeneration.tests.cpp:<line number>
+...............................................................................
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.a() == -10 )
+with expansion:
+  -10 == -10
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.b() == 10 )
+with expansion:
+  10 == 10
+
 -------------------------------------------------------------------------------
 unique_ptr reimplementation: basic functionality
   Default constructed unique_ptr is empty
@@ -18231,6 +18751,6 @@ Misc.tests.cpp:<line number>
 Misc.tests.cpp:<line number>: PASSED:
 
 ===============================================================================
-test cases:  409 |  309 passed |  84 failed | 5 skipped | 11 failed as expected
-assertions: 2226 | 2049 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
index 80b63ab8..5d204990 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/console.sw.multi.approved.txt
@@ -2738,9 +2738,9 @@ Message.tests.cpp:<line number>
 
 Message.tests.cpp:<line number>: PASSED:
 with messages:
-  std::vector<int>{1, 2, 3}[0, 1, 2] := 3
-  std::vector<int>{1, 2, 3}[(0, 1)] := 2
-  std::vector<int>{1, 2, 3}[0] := 1
+  custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0
+  custom_index_op<int>{1, 2, 3}[(0, 1)] := 0
+  custom_index_op<int>{1, 2, 3}[0] := 0
   (helper_1436<int, int>{12, -12}) := { 12, -12 }
   (helper_1436<int, int>(-12, 12)) := { -12, 12 }
   (1, 2) := 2
@@ -3954,6 +3954,16 @@ with expansion:
   ==
   "{** unexpected enum value **}"
 
+-------------------------------------------------------------------------------
+Empty generators can SKIP in constructor
+-------------------------------------------------------------------------------
+Skip.tests.cpp:<line number>
+...............................................................................
+
+Skip.tests.cpp:<line number>: SKIPPED:
+explicitly with message:
+  This generator is empty
+
 -------------------------------------------------------------------------------
 Empty stream name opens cout stream
 -------------------------------------------------------------------------------
@@ -3965,15 +3975,6 @@ Stream.tests.cpp:<line number>: PASSED:
 with expansion:
   true
 
--------------------------------------------------------------------------------
-Empty tag is not allowed
--------------------------------------------------------------------------------
-Tag.tests.cpp:<line number>
-...............................................................................
-
-Tag.tests.cpp:<line number>: PASSED:
-  REQUIRE_THROWS( Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo ) )
-
 -------------------------------------------------------------------------------
 EndsWith string matcher
 -------------------------------------------------------------------------------
@@ -4886,6 +4887,50 @@ Matchers.tests.cpp:<line number>: PASSED:
 with expansion:
   1.0 not is NaN
 
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
+-------------------------------------------------------------------------------
+GENERATE can combine literals and generators
+-------------------------------------------------------------------------------
+Generators.tests.cpp:<line number>
+...............................................................................
+
+Generators.tests.cpp:<line number>: PASSED:
+  REQUIRE( i % 2 == 0 )
+with expansion:
+  0 == 0
+
 -------------------------------------------------------------------------------
 Generators -- adapters
   Filtering by predicate
@@ -6979,6 +7024,18 @@ with expansion:
   ==
   3422778688 (0x<hex digits>)
 
+-------------------------------------------------------------------------------
+INFO and UNSCOPED_INFO can stream multiple arguments
+-------------------------------------------------------------------------------
+Message.tests.cpp:<line number>
+...............................................................................
+
+Message.tests.cpp:<line number>: FAILED:
+explicitly with messages:
+  This info has multiple parts.
+  This unscoped info has multiple parts.
+  Show infos!
+
 -------------------------------------------------------------------------------
 INFO and WARN do not abort tests
 -------------------------------------------------------------------------------
@@ -7140,6 +7197,17 @@ with messages:
   current counter 10
   i := 10
 
+-------------------------------------------------------------------------------
+Incomplete AssertionHandler
+-------------------------------------------------------------------------------
+AssertionHandler.tests.cpp:<line number>
+...............................................................................
+
+AssertionHandler.tests.cpp:<line number>: FAILED:
+  REQUIRE( Dummy )
+due to unexpected exception with message:
+  Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+
 -------------------------------------------------------------------------------
 Inequality checks that should fail
 -------------------------------------------------------------------------------
@@ -7232,6 +7300,291 @@ Condition.tests.cpp:<line number>: PASSED:
 with expansion:
   5 != 6
 
+-------------------------------------------------------------------------------
+JsonWriter
+  Newly constructed JsonWriter does nothing
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "" )
+with expansion:
+  "" == ""
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeObject with key will create an object to write the value
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) )
+with expansion:
+  "{
+    "int": 1,
+    "double": 1.5,
+    "true": true,
+    "false": false,
+    "string": "this is a string",
+    "array": [
+      1,
+      2
+    ]
+  }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains:
+  ""true": true," and contains: ""false": false," and contains: ""string":
+  "this is a string"," and contains: ""array": [
+      1,
+      2
+    ]
+  }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  nesting objects
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) )
+with expansion:
+  "{
+    "empty_object": {
+    },
+    "fully_object": {
+      "key": 1
+    }
+  }" ( contains: ""empty_object": {
+    }," and contains: ""fully_object": {
+      "key": 1
+    }" )
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray will create an empty pair of braces
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Calling writeArray creates array to write the values to
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" )
+with expansion:
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+  ==
+  "[
+    1,
+    1.5,
+    true,
+    false,
+    "this is a string",
+    {
+      "object": 42
+    },
+    [
+      "array",
+      42.5
+    ]
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonObjectWriter shall not insert superfluous brace
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "{\n}" )
+with expansion:
+  "{
+  }"
+  ==
+  "{
+  }"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Moved from JsonArrayWriter shall not insert superfluous bracket
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "[\n]" )
+with expansion:
+  "[
+  ]"
+  ==
+  "[
+  ]"
+
+-------------------------------------------------------------------------------
+JsonWriter
+  Custom class shall be quoted
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( stream.str() == "\"custom\"" )
+with expansion:
+  ""custom"" == ""custom""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Quote in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\"\"" )
+with expansion:
+  ""\""" == ""\"""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backslash in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\\"" )
+with expansion:
+  ""\\"" == ""\\""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Forward slash in a string is **not** escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"/\"" )
+with expansion:
+  ""/"" == ""/""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Backspace in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\b\"" )
+with expansion:
+  ""\b"" == ""\b""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  Formfeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\f\"" )
+with expansion:
+  ""\f"" == ""\f""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  linefeed in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\n\"" )
+with expansion:
+  ""\n"" == ""\n""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  carriage return in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\r\"" )
+with expansion:
+  ""\r"" == ""\r""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  tab in a string is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\t\"" )
+with expansion:
+  ""\t"" == ""\t""
+
+-------------------------------------------------------------------------------
+JsonWriter escapes charaters in strings properly
+  combination of characters is escaped
+-------------------------------------------------------------------------------
+Json.tests.cpp:<line number>
+...............................................................................
+
+Json.tests.cpp:<line number>: PASSED:
+  REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" )
+with expansion:
+  ""\\/\t\r\n"" == ""\\/\t\r\n""
+
 -------------------------------------------------------------------------------
 Lambdas in assertions
 -------------------------------------------------------------------------------
@@ -9730,6 +10083,129 @@ Reporter's write listings to provided stream
 Reporters.tests.cpp:<line number>
 ...............................................................................
 
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tags
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fakeTag"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tags": [
+        {
+          "aliases": [
+            "fakeTag"
+          ],
+          "count": 1
+        }
+      ]" contains: "fakeTag"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists reporters
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring("fake reporter"s) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "reporters": [
+        {
+          "name": "fake reporter",
+          "description": "fake description"
+        }
+      ]" contains: "fake reporter"
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_FALSE( factories.empty() )
+with expansion:
+  !false
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+  JSON reporter lists tests
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
+Reporters.tests.cpp:<line number>: PASSED:
+  REQUIRE_THAT( listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) )
+with expansion:
+  "{
+    "version": 1,
+    "metadata": {
+      "name": "",
+      "rng-seed": 1234,
+      "catch2-version": "<version>"
+    },
+    "listings": {
+      "tests": [
+        {
+          "name": "fake test name",
+          "class-name": "",
+          "tags": [
+            "fakeTestTag"
+          ],
+          "source-location": {
+            "filename": "fake-file.cpp",
+            "line": 123456789
+          }
+        }
+      ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+with message:
+  Tested reporter: JSON
+
+-------------------------------------------------------------------------------
+Reporter's write listings to provided stream
+-------------------------------------------------------------------------------
+Reporters.tests.cpp:<line number>
+...............................................................................
+
 Reporters.tests.cpp:<line number>: PASSED:
   REQUIRE_FALSE( factories.empty() )
 with expansion:
@@ -12514,6 +12990,34 @@ Misc.tests.cpp:<line number>: FAILED - but was ok:
 
 Misc.tests.cpp:<line number>: FAILED:
 
+-------------------------------------------------------------------------------
+Testing checked-if 4
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: PASSED:
+  CHECKED_ELSE( true )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
+-------------------------------------------------------------------------------
+Testing checked-if 5
+-------------------------------------------------------------------------------
+Misc.tests.cpp:<line number>
+...............................................................................
+
+Misc.tests.cpp:<line number>: FAILED - but was ok:
+  CHECKED_ELSE( false )
+
+Misc.tests.cpp:<line number>: FAILED:
+  {Unknown expression after the reported line}
+due to unexpected exception with message:
+  Uncaught exception should fail!
+
 -------------------------------------------------------------------------------
 The NO_FAIL macro reports a failure but does not fail the test
 -------------------------------------------------------------------------------
@@ -13453,7 +13957,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AllTrue range matcher
   Basic usage
-  One false evalutes to false
+  One false evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13492,7 +13996,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AllTrue range matcher
   Contained type is convertible to bool
-  One false evalutes to false
+  One false evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13728,7 +14232,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AnyTrue range matcher
   Basic usage
-  One true evalutes to true
+  One true evaluates to true
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -13767,7 +14271,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of AnyTrue range matcher
   Contained type is convertible to bool
-  One true evalutes to true
+  One true evaluates to true
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -14003,7 +14507,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of NoneTrue range matcher
   Basic usage
-  One true evalutes to false
+  One true evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -14042,7 +14546,7 @@ with expansion:
 -------------------------------------------------------------------------------
 Usage of NoneTrue range matcher
   Contained type is convertible to bool
-  One true evalutes to false
+  One true evaluates to false
 -------------------------------------------------------------------------------
 MatchersRanges.tests.cpp:<line number>
 ...............................................................................
@@ -17733,6 +18237,22 @@ InternalBenchmark.tests.cpp:<line number>: PASSED:
 with expansion:
   0.95 == 0.95
 
+-------------------------------------------------------------------------------
+uniform_integer_distribution can return the bounds
+-------------------------------------------------------------------------------
+RandomNumberGeneration.tests.cpp:<line number>
+...............................................................................
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.a() == -10 )
+with expansion:
+  -10 == -10
+
+RandomNumberGeneration.tests.cpp:<line number>: PASSED:
+  REQUIRE( dist.b() == 10 )
+with expansion:
+  10 == 10
+
 -------------------------------------------------------------------------------
 unique_ptr reimplementation: basic functionality
   Default constructed unique_ptr is empty
@@ -18220,6 +18740,6 @@ Misc.tests.cpp:<line number>
 Misc.tests.cpp:<line number>: PASSED:
 
 ===============================================================================
-test cases:  409 |  309 passed |  84 failed | 5 skipped | 11 failed as expected
-assertions: 2226 | 2049 passed | 145 failed | 32 failed as expected
+test cases:  417 |  312 passed |  85 failed | 6 skipped | 14 failed as expected
+assertions: 2260 | 2079 passed | 146 failed | 35 failed as expected
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
index 25129349..48eccfc3 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.approved.txt
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <testsuitesloose text artifact
 >
-  <testsuite name="<exe-name>" errors="17" failures="128" skipped="11" tests="2237" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
+  <testsuite name="<exe-name>" errors="17" failures="129" skipped="12" tests="2272" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
     <properties>
       <property name="random-seed" value="1"/>
       <property name="filters" value="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]"/>
@@ -462,8 +462,14 @@ at Exception.tests.cpp:<line number>
     </testcase>
     <testcase classname="<exe-name>.global" name="Default scale is invisible to comparison" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Directly creating an EnumInfo" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Empty generators can SKIP in constructor" time="{duration}" status="run">
+      <skipped type="SKIP">
+SKIPPED
+This generator is empty
+at Skip.tests.cpp:<line number>
+      </skipped>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Empty stream name opens cout stream" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Empty tag is not allowed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="EndsWith string matcher" time="{duration}" status="run">
       <failure message="testStringForMatching(), EndsWith( &quot;Substring&quot; )" type="CHECK_THAT">
 FAILED:
@@ -702,6 +708,7 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Composed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Constructor validation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/IsNaN" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="GENERATE can combine literals and generators" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Basic usage" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Shortening a range" time="{duration}" status="run"/>
@@ -746,6 +753,15 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different classname" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing test case produces same hash across multiple calls" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="INFO and UNSCOPED_INFO can stream multiple arguments" time="{duration}" status="run">
+      <failure type="FAIL">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
+at Message.tests.cpp:<line number>
+      </failure>
+    </testcase>
     <testcase classname="<exe-name>.global" name="INFO and WARN do not abort tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="INFO gets logged on failure" time="{duration}" status="run">
       <failure message="a == 1" type="REQUIRE">
@@ -790,6 +806,15 @@ i := 10
 at Message.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Incomplete AssertionHandler" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="Dummy" type="REQUIRE">
+FAILED:
+  REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should fail" time="{duration}" status="run">
       <skipped message="TEST_CASE tagged with !mayfail"/>
       <failure message="data.int_seven != 7" type="CHECK">
@@ -829,6 +854,24 @@ at Condition.tests.cpp:<line number>
       </failure>
     </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should succeed" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Newly constructed JsonWriter does nothing" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject with key will create an object to write the value" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/nesting objects" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray creates array to write the values to" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Custom class shall be quoted" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Lambdas in assertions" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Less-than inequalities with different epsilons" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="ManuallyRegistered" time="{duration}" status="run"/>
@@ -1166,6 +1209,9 @@ at Matchers.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tests" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tags" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists reporters" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tests" time="{duration}" status="run"/>
@@ -1354,6 +1400,24 @@ FAILED:
 at Misc.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 4" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 5" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="The NO_FAIL macro reports a failure but does not fail the test" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing reporters" time="{duration}" status="run"/>
@@ -1401,10 +1465,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of AllMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/All true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/Empty evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/One false evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/One false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/All true evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -1414,10 +1478,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of AnyMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/All true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/Empty evaluates to false" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/One true evalutes to true" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/One true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All true evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evalutes to true" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -1427,10 +1491,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of NoneMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/All true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/Empty evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/One true evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/One true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/All false evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All true evaluates to false" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All false evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -2011,6 +2075,7 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="tuple&lt;string,string>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="tuple&lt;tuple&lt;int>,tuple&lt;>,float>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="uniform samples" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="uniform_integer_distribution can return the bounds" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Default constructed unique_ptr is empty" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation/Plain reset deallocates" time="{duration}" status="run"/>
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
index 6220d8e2..d270c88f 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/junit.sw.multi.approved.txt
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <testsuites>
-  <testsuite name="<exe-name>" errors="17" failures="128" skipped="11" tests="2237" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
+  <testsuite name="<exe-name>" errors="17" failures="129" skipped="12" tests="2272" hostname="tbd" time="{duration}" timestamp="{iso8601-timestamp}">
     <properties>
       <property name="random-seed" value="1"/>
       <property name="filters" value="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]"/>
@@ -461,8 +461,14 @@ at Exception.tests.cpp:<line number>
     </testcase>
     <testcase classname="<exe-name>.global" name="Default scale is invisible to comparison" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Directly creating an EnumInfo" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Empty generators can SKIP in constructor" time="{duration}" status="run">
+      <skipped type="SKIP">
+SKIPPED
+This generator is empty
+at Skip.tests.cpp:<line number>
+      </skipped>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Empty stream name opens cout stream" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Empty tag is not allowed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="EndsWith string matcher" time="{duration}" status="run">
       <failure message="testStringForMatching(), EndsWith( &quot;Substring&quot; )" type="CHECK_THAT">
 FAILED:
@@ -701,6 +707,7 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Composed" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/Constructor validation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Floating point matchers: float/IsNaN" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="GENERATE can combine literals and generators" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Basic usage" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Generators -- adapters/Shortening a range" time="{duration}" status="run"/>
@@ -745,6 +752,15 @@ at Message.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different classname" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing different test cases produces different result/Different tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Hashing test case produces same hash across multiple calls" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="INFO and UNSCOPED_INFO can stream multiple arguments" time="{duration}" status="run">
+      <failure type="FAIL">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
+at Message.tests.cpp:<line number>
+      </failure>
+    </testcase>
     <testcase classname="<exe-name>.global" name="INFO and WARN do not abort tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="INFO gets logged on failure" time="{duration}" status="run">
       <failure message="a == 1" type="REQUIRE">
@@ -789,6 +805,15 @@ i := 10
 at Message.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Incomplete AssertionHandler" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="Dummy" type="REQUIRE">
+FAILED:
+  REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should fail" time="{duration}" status="run">
       <skipped message="TEST_CASE tagged with !mayfail"/>
       <failure message="data.int_seven != 7" type="CHECK">
@@ -828,6 +853,24 @@ at Condition.tests.cpp:<line number>
       </failure>
     </testcase>
     <testcase classname="<exe-name>.global" name="Inequality checks that should succeed" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Newly constructed JsonWriter does nothing" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeObject with key will create an object to write the value" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/nesting objects" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray will create an empty pair of braces" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Calling writeArray creates array to write the values to" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter/Custom class shall be quoted" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Lambdas in assertions" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Less-than inequalities with different epsilons" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="ManuallyRegistered" time="{duration}" status="run"/>
@@ -1165,6 +1208,9 @@ at Matchers.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/console reporter lists tests" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tags" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists reporters" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JSON reporter lists tests" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists reporters" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Reporter's write listings to provided stream/JUnit reporter lists tests" time="{duration}" status="run"/>
@@ -1353,6 +1399,24 @@ FAILED:
 at Misc.tests.cpp:<line number>
       </failure>
     </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 4" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
+    <testcase classname="<exe-name>.global" name="Testing checked-if 5" time="{duration}" status="run">
+      <skipped message="TEST_CASE tagged with !mayfail"/>
+      <error message="{Unknown expression after the reported line}">
+FAILED:
+  {Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </error>
+    </testcase>
     <testcase classname="<exe-name>.global" name="The NO_FAIL macro reports a failure but does not fail the test" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing tags" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="The default listing implementation write to provided stream/Listing reporters" time="{duration}" status="run"/>
@@ -1400,10 +1464,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of AllMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/All true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/Empty evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/One false evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/One false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Basic usage/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/All true evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Contained type is convertible to bool/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AllTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -1413,10 +1477,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of AnyMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/All true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/Empty evaluates to false" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/One true evalutes to true" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/One true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Basic usage/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All true evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evalutes to true" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All false evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of AnyTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -1426,10 +1490,10 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="Usage of NoneMatch range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/All true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/Empty evaluates to true" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/One true evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/One true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Basic usage/All false evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All true evaluates to false" time="{duration}" status="run"/>
-    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evalutes to false" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evaluates to false" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All false evaluates to true" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Shortcircuiting/All are read" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="Usage of NoneTrue range matcher/Shortcircuiting/Short-circuited" time="{duration}" status="run"/>
@@ -2010,6 +2074,7 @@ at Exception.tests.cpp:<line number>
     <testcase classname="<exe-name>.global" name="tuple&lt;string,string>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="tuple&lt;tuple&lt;int>,tuple&lt;>,float>" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="uniform samples" time="{duration}" status="run"/>
+    <testcase classname="<exe-name>.global" name="uniform_integer_distribution can return the bounds" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Default constructed unique_ptr is empty" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation" time="{duration}" status="run"/>
     <testcase classname="<exe-name>.global" name="unique_ptr reimplementation: basic functionality/Take ownership of allocation/Plain reset deallocates" time="{duration}" status="run"/>
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
index a4e08bd2..36b05e54 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.approved.txt
@@ -2,6 +2,16 @@
 <!-- filters='"*" ~[!nonportable] ~[!benchmark] ~[approvals]' rng-seed=1 -->
 <testExecutions version="1"loose text artifact
 >
+  <file path="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp">
+    <testCase name="Incomplete AssertionHandler" duration="{duration}">
+      <skipped message="REQUIRE(Dummy)">
+FAILED:
+	REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Clara.tests.cpp">
     <testCase name="Clara::Arg supports single-arg parse the way Opt does" duration="{duration}"/>
     <testCase name="Clara::Opt supports accept-many lambdas/Parsing fails on multiple options without accept_many" duration="{duration}"/>
@@ -120,6 +130,26 @@
     <testCase name="warmup" duration="{duration}"/>
     <testCase name="weighted_average_quantile" duration="{duration}"/>
   </file>
+  <file path="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp">
+    <testCase name="JsonWriter/Newly constructed JsonWriter does nothing" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject with key will create an object to write the value" duration="{duration}"/>
+    <testCase name="JsonWriter/nesting objects" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray creates array to write the values to" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" duration="{duration}"/>
+    <testCase name="JsonWriter/Custom class shall be quoted" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" duration="{duration}"/>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Parse.tests.cpp">
     <testCase name="Parse uints/proper inputs" duration="{duration}"/>
     <testCase name="Parse uints/Bad inputs" duration="{duration}"/>
@@ -151,6 +181,7 @@
     <testCase name="Our PCG implementation provides expected results for known seeds/Specific seed" duration="{duration}"/>
     <testCase name="Random seed generation accepts known methods" duration="{duration}"/>
     <testCase name="Random seed generation reports unknown methods" duration="{duration}"/>
+    <testCase name="uniform_integer_distribution can return the bounds" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp">
     <testCase name="Multireporter calls reporters and listeners in correct order" duration="{duration}"/>
@@ -168,6 +199,9 @@
     <testCase name="Reporter's write listings to provided stream/console reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists tests" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tags" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists reporters" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tests" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tests" duration="{duration}"/>
@@ -233,7 +267,6 @@
     <testCase name="startsWith" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp">
-    <testCase name="Empty tag is not allowed" duration="{duration}"/>
     <testCase name="Tag alias can be registered against tag patterns/The same tag alias can only be registered once" duration="{duration}"/>
     <testCase name="Tag alias can be registered against tag patterns/Tag aliases must be of the form [@name]" duration="{duration}"/>
     <testCase name="Tags with spaces and non-alphanumerical characters are accepted" duration="{duration}"/>
@@ -1039,6 +1072,7 @@ at Generators.tests.cpp:<line number>
     <testCase name="Copy and then generate a range/from var and iterators" duration="{duration}"/>
     <testCase name="Copy and then generate a range/From a temporary container" duration="{duration}"/>
     <testCase name="Copy and then generate a range/Final validation" duration="{duration}"/>
+    <testCase name="GENERATE can combine literals and generators" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Basic usage" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" duration="{duration}"/>
     <testCase name="Generators -- adapters/Shortening a range" duration="{duration}"/>
@@ -1389,10 +1423,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of AllMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/All true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/Empty evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AllTrue range matcher/Basic usage/One false evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of AllTrue range matcher/Basic usage/One false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/All true evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1402,10 +1436,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of AnyMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/All true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/Empty evaluates to false" duration="{duration}"/>
-    <testCase name="Usage of AnyTrue range matcher/Basic usage/One true evalutes to true" duration="{duration}"/>
+    <testCase name="Usage of AnyTrue range matcher/Basic usage/One true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All true evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evalutes to true" duration="{duration}"/>
+    <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1415,10 +1449,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of NoneMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/All true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/Empty evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of NoneTrue range matcher/Basic usage/One true evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of NoneTrue range matcher/Basic usage/One true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/All false evaluates to true" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All true evaluates to false" duration="{duration}"/>
-    <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All false evaluates to true" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1468,6 +1502,15 @@ at Message.tests.cpp:<line number>
       <failure message="FAIL_CHECK()">
 FAILED:
 This is a failure
+at Message.tests.cpp:<line number>
+      </failure>
+    </testCase>
+    <testCase name="INFO and UNSCOPED_INFO can stream multiple arguments" duration="{duration}">
+      <failure message="FAIL()">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
 at Message.tests.cpp:<line number>
       </failure>
     </testCase>
@@ -1728,6 +1771,22 @@ at Misc.tests.cpp:<line number>
     <testCase name="Testing checked-if 3" duration="{duration}">
       <skipped message="FAIL()">
 FAILED:
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 4" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 5" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
 at Misc.tests.cpp:<line number>
       </skipped>
     </testCase>
@@ -1871,6 +1930,13 @@ at Misc.tests.cpp:<line number>
     <testCase name="xmlentitycheck/encoded chars: these should all be encoded: &amp;&amp;&amp;&quot;&quot;&quot;&lt;&lt;&lt;&amp;&quot;&lt;&lt;&amp;&quot;" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/UsageTests/Skip.tests.cpp">
+    <testCase name="Empty generators can SKIP in constructor" duration="{duration}">
+      <skipped message="SKIP()">
+SKIPPED
+This generator is empty
+at Skip.tests.cpp:<line number>
+      </skipped>
+    </testCase>
     <testCase name="a succeeding test can still be skipped" duration="{duration}">
       <skipped message="SKIP()">
 SKIPPED
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
index c00defae..c9d3d205 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/sonarqube.sw.multi.approved.txt
@@ -1,6 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!-- filters='"*" ~[!nonportable] ~[!benchmark] ~[approvals]' rng-seed=1 -->
 <testExecutions version="1">
+  <file path="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp">
+    <testCase name="Incomplete AssertionHandler" duration="{duration}">
+      <skipped message="REQUIRE(Dummy)">
+FAILED:
+	REQUIRE( Dummy )
+Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+at AssertionHandler.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Clara.tests.cpp">
     <testCase name="Clara::Arg supports single-arg parse the way Opt does" duration="{duration}"/>
     <testCase name="Clara::Opt supports accept-many lambdas/Parsing fails on multiple options without accept_many" duration="{duration}"/>
@@ -119,6 +129,26 @@
     <testCase name="warmup" duration="{duration}"/>
     <testCase name="weighted_average_quantile" duration="{duration}"/>
   </file>
+  <file path="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp">
+    <testCase name="JsonWriter/Newly constructed JsonWriter does nothing" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeObject with key will create an object to write the value" duration="{duration}"/>
+    <testCase name="JsonWriter/nesting objects" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray will create an empty pair of braces" duration="{duration}"/>
+    <testCase name="JsonWriter/Calling writeArray creates array to write the values to" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonObjectWriter shall not insert superfluous brace" duration="{duration}"/>
+    <testCase name="JsonWriter/Moved from JsonArrayWriter shall not insert superfluous bracket" duration="{duration}"/>
+    <testCase name="JsonWriter/Custom class shall be quoted" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Quote in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backslash in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Forward slash in a string is **not** escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Backspace in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/Formfeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/linefeed in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/carriage return in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/tab in a string is escaped" duration="{duration}"/>
+    <testCase name="JsonWriter escapes charaters in strings properly/combination of characters is escaped" duration="{duration}"/>
+  </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Parse.tests.cpp">
     <testCase name="Parse uints/proper inputs" duration="{duration}"/>
     <testCase name="Parse uints/Bad inputs" duration="{duration}"/>
@@ -150,6 +180,7 @@
     <testCase name="Our PCG implementation provides expected results for known seeds/Specific seed" duration="{duration}"/>
     <testCase name="Random seed generation accepts known methods" duration="{duration}"/>
     <testCase name="Random seed generation reports unknown methods" duration="{duration}"/>
+    <testCase name="uniform_integer_distribution can return the bounds" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp">
     <testCase name="Multireporter calls reporters and listeners in correct order" duration="{duration}"/>
@@ -167,6 +198,9 @@
     <testCase name="Reporter's write listings to provided stream/console reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/console reporter lists tests" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tags" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists reporters" duration="{duration}"/>
+    <testCase name="Reporter's write listings to provided stream/JSON reporter lists tests" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tags" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists reporters" duration="{duration}"/>
     <testCase name="Reporter's write listings to provided stream/JUnit reporter lists tests" duration="{duration}"/>
@@ -232,7 +266,6 @@
     <testCase name="startsWith" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp">
-    <testCase name="Empty tag is not allowed" duration="{duration}"/>
     <testCase name="Tag alias can be registered against tag patterns/The same tag alias can only be registered once" duration="{duration}"/>
     <testCase name="Tag alias can be registered against tag patterns/Tag aliases must be of the form [@name]" duration="{duration}"/>
     <testCase name="Tags with spaces and non-alphanumerical characters are accepted" duration="{duration}"/>
@@ -1038,6 +1071,7 @@ at Generators.tests.cpp:<line number>
     <testCase name="Copy and then generate a range/from var and iterators" duration="{duration}"/>
     <testCase name="Copy and then generate a range/From a temporary container" duration="{duration}"/>
     <testCase name="Copy and then generate a range/Final validation" duration="{duration}"/>
+    <testCase name="GENERATE can combine literals and generators" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Basic usage" duration="{duration}"/>
     <testCase name="Generators -- adapters/Filtering by predicate/Throws if there are no matching values" duration="{duration}"/>
     <testCase name="Generators -- adapters/Shortening a range" duration="{duration}"/>
@@ -1388,10 +1422,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of AllMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/All true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/Empty evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AllTrue range matcher/Basic usage/One false evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of AllTrue range matcher/Basic usage/One false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Basic usage/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/All true evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/One false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Contained type is convertible to bool/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of AllTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1401,10 +1435,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of AnyMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/All true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/Empty evaluates to false" duration="{duration}"/>
-    <testCase name="Usage of AnyTrue range matcher/Basic usage/One true evalutes to true" duration="{duration}"/>
+    <testCase name="Usage of AnyTrue range matcher/Basic usage/One true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Basic usage/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All true evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evalutes to true" duration="{duration}"/>
+    <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/One true evaluates to true" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Contained type is convertible to bool/All false evaluates to false" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of AnyTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1414,10 +1448,10 @@ at Matchers.tests.cpp:<line number>
     <testCase name="Usage of NoneMatch range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/All true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/Empty evaluates to true" duration="{duration}"/>
-    <testCase name="Usage of NoneTrue range matcher/Basic usage/One true evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of NoneTrue range matcher/Basic usage/One true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Basic usage/All false evaluates to true" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All true evaluates to false" duration="{duration}"/>
-    <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evalutes to false" duration="{duration}"/>
+    <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/One true evaluates to false" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Contained type is convertible to bool/All false evaluates to true" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Shortcircuiting/All are read" duration="{duration}"/>
     <testCase name="Usage of NoneTrue range matcher/Shortcircuiting/Short-circuited" duration="{duration}"/>
@@ -1467,6 +1501,15 @@ at Message.tests.cpp:<line number>
       <failure message="FAIL_CHECK()">
 FAILED:
 This is a failure
+at Message.tests.cpp:<line number>
+      </failure>
+    </testCase>
+    <testCase name="INFO and UNSCOPED_INFO can stream multiple arguments" duration="{duration}">
+      <failure message="FAIL()">
+FAILED:
+Show infos!
+This info has multiple parts.
+This unscoped info has multiple parts.
 at Message.tests.cpp:<line number>
       </failure>
     </testCase>
@@ -1727,6 +1770,22 @@ at Misc.tests.cpp:<line number>
     <testCase name="Testing checked-if 3" duration="{duration}">
       <skipped message="FAIL()">
 FAILED:
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 4" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
+at Misc.tests.cpp:<line number>
+      </skipped>
+    </testCase>
+    <testCase name="Testing checked-if 5" duration="{duration}">
+      <skipped message="({Unknown expression after the reported line})">
+FAILED:
+	{Unknown expression after the reported line}
+Uncaught exception should fail!
 at Misc.tests.cpp:<line number>
       </skipped>
     </testCase>
@@ -1870,6 +1929,13 @@ at Misc.tests.cpp:<line number>
     <testCase name="xmlentitycheck/encoded chars: these should all be encoded: &amp;&amp;&amp;&quot;&quot;&quot;&lt;&lt;&lt;&amp;&quot;&lt;&lt;&amp;&quot;" duration="{duration}"/>
   </file>
   <file path="tests/<exe-name>/UsageTests/Skip.tests.cpp">
+    <testCase name="Empty generators can SKIP in constructor" duration="{duration}">
+      <skipped message="SKIP()">
+SKIPPED
+This generator is empty
+at Skip.tests.cpp:<line number>
+      </skipped>
+    </testCase>
     <testCase name="a succeeding test can still be skipped" duration="{duration}">
       <skipped message="SKIP()">
 SKIPPED
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
index 920c95fd..a02dbd95 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.approved.txt
@@ -659,7 +659,7 @@ ok {test-number} - unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 # CAPTURE can deal with complex expressions
 ok {test-number} - with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
 # CAPTURE can deal with complex expressions involving commas
-ok {test-number} - with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+ok {test-number} - with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 # CAPTURE parses string and character constants
 ok {test-number} - with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 # Capture and info messages
@@ -984,10 +984,10 @@ ok {test-number} - enumInfo->lookup(0) == "Value1" for: Value1 == "Value1"
 ok {test-number} - enumInfo->lookup(1) == "Value2" for: Value2 == "Value2"
 # Directly creating an EnumInfo
 ok {test-number} - enumInfo->lookup(3) == "{** unexpected enum value **}" for: {** unexpected enum value **} == "{** unexpected enum value **}"
+# Empty generators can SKIP in constructor
+ok {test-number} -  # SKIP 'This generator is empty'
 # Empty stream name opens cout stream
 ok {test-number} - Catch::makeStream( "" )->isConsole() for: true
-# Empty tag is not allowed
-ok {test-number} - Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
 # EndsWith string matcher
 not ok {test-number} - testStringForMatching(), EndsWith( "Substring" ) for: "this string contains 'abc' as a substring" ends with: "Substring"
 # EndsWith string matcher
@@ -1258,6 +1258,14 @@ ok {test-number} - WithinRel( 1.f, -0.2f ), std::domain_error
 ok {test-number} - WithinRel( 1.f, 1.f ), std::domain_error
 # Floating point matchers: float
 ok {test-number} - 1., !IsNaN() for: 1.0 not is NaN
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
 ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
@@ -1796,6 +1804,8 @@ ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2673152918 (0x<hex digits>) !
 ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2074929312 (0x<hex digits>) != 3429949824 (0x<hex digits>)
 # Hashing test case produces same hash across multiple calls
 ok {test-number} - h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>) == 3422778688 (0x<hex digits>)
+# INFO and UNSCOPED_INFO can stream multiple arguments
+not ok {test-number} - explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 # INFO and WARN do not abort tests
 warning {test-number} - 'this is a message' with 1 message: 'this is a warning'
 # INFO gets logged on failure
@@ -1830,6 +1840,8 @@ ok {test-number} - i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and '
 ok {test-number} - i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 # INFO is reset for each loop
 not ok {test-number} - i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+# Incomplete AssertionHandler
+not ok {test-number} - unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 # Inequality checks that should fail
 not ok {test-number} - data.int_seven != 7 for: 7 != 7
 # Inequality checks that should fail
@@ -1862,6 +1874,42 @@ ok {test-number} - data.str_hello != "hell" for: "hello" != "hell"
 ok {test-number} - data.str_hello != "hello1" for: "hello" != "hello1"
 # Inequality checks that should succeed
 ok {test-number} - data.str_hello.size() != 6 for: 5 != 6
+# JsonWriter
+ok {test-number} - stream.str() == "" for: "" == ""
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{   "int": 1,   "double": 1.5,   "true": true,   "false": false,   "string": "this is a string",   "array": [     1,     2   ] }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [     1,     2   ] }" )
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{   "empty_object": {   },   "fully_object": {     "key": 1   } }" ( contains: ""empty_object": {   }," and contains: ""fully_object": {     "key": 1   }" )
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]" == "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]"
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"/\"" for: ""/"" == ""/""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 # Lambdas in assertions
 ok {test-number} - []() { return true; }() for: true
 # Less-than inequalities with different epsilons
@@ -2455,6 +2503,18 @@ ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && Cont
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
 # Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tags": [       {         "aliases": [           "fakeTag"         ],         "count": 1       }     ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fake reporter"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "reporters": [       {         "name": "fake reporter",         "description": "fake description"       }     ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tests": [       {         "name": "fake test name",         "class-name": "",         "tags": [           "fakeTestTag"         ],         "source-location": {           "filename": "fake-file.cpp",           "line": 123456789         }       }     ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
 ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?> All available tags:    1  [fakeTag] 1 tag  " contains: "fakeTag" with 1 message: 'Tested reporter: JUnit'
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
@@ -3067,6 +3127,14 @@ not ok {test-number} - explicitly
 ok {test-number} - false  # TODO
 # Testing checked-if 3
 not ok {test-number} - explicitly
+# Testing checked-if 4
+ok {test-number} - true
+# Testing checked-if 4
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+# Testing checked-if 5
+ok {test-number} - false  # TODO
+# Testing checked-if 5
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 # The NO_FAIL macro reports a failure but does not fail the test
 ok {test-number} - 1 == 2  # TODO
 # The default listing implementation write to provided stream
@@ -4355,6 +4423,10 @@ ok {test-number} - e.upper_bound == 23 for: 23.0 == 23
 ok {test-number} - e.lower_bound == 23 for: 23.0 == 23
 # uniform samples
 ok {test-number} - e.confidence_interval == 0.95 for: 0.95 == 0.95
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.a() == -10 for: -10 == -10
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.b() == 10 for: 10 == 10
 # unique_ptr reimplementation: basic functionality
 ok {test-number} - !(ptr) for: !{?}
 # unique_ptr reimplementation: basic functionality
@@ -4477,5 +4549,5 @@ ok {test-number} - q3 == 23. for: 23.0 == 23.0
 ok {test-number} -
 # xmlentitycheck
 ok {test-number} -
-1..2237
+1..2272
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
index c0e0c4db..13449bd4 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/tap.sw.multi.approved.txt
@@ -657,7 +657,7 @@ ok {test-number} - unrelated::ADL_empty{}, IsEmpty() for: {?} is empty
 # CAPTURE can deal with complex expressions
 ok {test-number} - with 7 messages: 'a := 1' and 'b := 2' and 'c := 3' and 'a + b := 3' and 'a+b := 3' and 'c > b := true' and 'a == 1 := true'
 # CAPTURE can deal with complex expressions involving commas
-ok {test-number} - with 7 messages: 'std::vector<int>{1, 2, 3}[0, 1, 2] := 3' and 'std::vector<int>{1, 2, 3}[(0, 1)] := 2' and 'std::vector<int>{1, 2, 3}[0] := 1' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
+ok {test-number} - with 7 messages: 'custom_index_op<int>{1, 2, 3}[0, 1, 2] := 0' and 'custom_index_op<int>{1, 2, 3}[(0, 1)] := 0' and 'custom_index_op<int>{1, 2, 3}[0] := 0' and '(helper_1436<int, int>{12, -12}) := { 12, -12 }' and '(helper_1436<int, int>(-12, 12)) := { -12, 12 }' and '(1, 2) := 2' and '(2, 3) := 3'
 # CAPTURE parses string and character constants
 ok {test-number} - with 11 messages: '("comma, in string", "escaped, \", ") := "escaped, ", "' and '"single quote in string,'," := "single quote in string,',"' and '"some escapes, \\,\\\\" := "some escapes, \,\\"' and '"some, ), unmatched, } prenheses {[<" := "some, ), unmatched, } prenheses {[<"' and ''"' := '"'' and ''\'' := '''' and '',' := ','' and ''}' := '}'' and '')' := ')'' and ''(' := '('' and ''{' := '{''
 # Capture and info messages
@@ -982,10 +982,10 @@ ok {test-number} - enumInfo->lookup(0) == "Value1" for: Value1 == "Value1"
 ok {test-number} - enumInfo->lookup(1) == "Value2" for: Value2 == "Value2"
 # Directly creating an EnumInfo
 ok {test-number} - enumInfo->lookup(3) == "{** unexpected enum value **}" for: {** unexpected enum value **} == "{** unexpected enum value **}"
+# Empty generators can SKIP in constructor
+ok {test-number} -  # SKIP 'This generator is empty'
 # Empty stream name opens cout stream
 ok {test-number} - Catch::makeStream( "" )->isConsole() for: true
-# Empty tag is not allowed
-ok {test-number} - Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
 # EndsWith string matcher
 not ok {test-number} - testStringForMatching(), EndsWith( "Substring" ) for: "this string contains 'abc' as a substring" ends with: "Substring"
 # EndsWith string matcher
@@ -1256,6 +1256,14 @@ ok {test-number} - WithinRel( 1.f, -0.2f ), std::domain_error
 ok {test-number} - WithinRel( 1.f, 1.f ), std::domain_error
 # Floating point matchers: float
 ok {test-number} - 1., !IsNaN() for: 1.0 not is NaN
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
+# GENERATE can combine literals and generators
+ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
 ok {test-number} - i % 2 == 0 for: 0 == 0
 # Generators -- adapters
@@ -1794,6 +1802,8 @@ ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2673152918 (0x<hex digits>) !
 ok {test-number} - h( dummy1 ) != h( dummy2 ) for: 2074929312 (0x<hex digits>) != 3429949824 (0x<hex digits>)
 # Hashing test case produces same hash across multiple calls
 ok {test-number} - h( dummy ) == h( dummy ) for: 3422778688 (0x<hex digits>) == 3422778688 (0x<hex digits>)
+# INFO and UNSCOPED_INFO can stream multiple arguments
+not ok {test-number} - explicitly with 3 messages: 'This info has multiple parts.' and 'This unscoped info has multiple parts.' and 'Show infos!'
 # INFO and WARN do not abort tests
 warning {test-number} - 'this is a message' with 1 message: 'this is a warning'
 # INFO gets logged on failure
@@ -1828,6 +1838,8 @@ ok {test-number} - i < 10 for: 8 < 10 with 2 messages: 'current counter 8' and '
 ok {test-number} - i < 10 for: 9 < 10 with 2 messages: 'current counter 9' and 'i := 9'
 # INFO is reset for each loop
 not ok {test-number} - i < 10 for: 10 < 10 with 2 messages: 'current counter 10' and 'i := 10'
+# Incomplete AssertionHandler
+not ok {test-number} - unexpected exception with message: 'Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE'; expression was: Dummy
 # Inequality checks that should fail
 not ok {test-number} - data.int_seven != 7 for: 7 != 7
 # Inequality checks that should fail
@@ -1860,6 +1872,42 @@ ok {test-number} - data.str_hello != "hell" for: "hello" != "hell"
 ok {test-number} - data.str_hello != "hello1" for: "hello" != "hello1"
 # Inequality checks that should succeed
 ok {test-number} - data.str_hello.size() != 6 for: 5 != 6
+# JsonWriter
+ok {test-number} - stream.str() == "" for: "" == ""
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"int\": 1," ) && ContainsSubstring( "\"double\": 1.5," ) && ContainsSubstring( "\"true\": true," ) && ContainsSubstring( "\"false\": false," ) && ContainsSubstring( "\"string\": \"this is a string\"," ) && ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) for: "{   "int": 1,   "double": 1.5,   "true": true,   "false": false,   "string": "this is a string",   "array": [     1,     2   ] }" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [     1,     2   ] }" )
+# JsonWriter
+ok {test-number} - stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) && ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" ) for: "{   "empty_object": {   },   "fully_object": {     "key": 1   } }" ( contains: ""empty_object": {   }," and contains: ""fully_object": {     "key": 1   }" )
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" for: "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]" == "[   1,   1.5,   true,   false,   "this is a string",   {     "object": 42   },   [     "array",     42.5   ] ]"
+# JsonWriter
+ok {test-number} - stream.str() == "{\n}" for: "{ }" == "{ }"
+# JsonWriter
+ok {test-number} - stream.str() == "[\n]" for: "[ ]" == "[ ]"
+# JsonWriter
+ok {test-number} - stream.str() == "\"custom\"" for: ""custom"" == ""custom""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\"\"" for: ""\""" == ""\"""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\\"" for: ""\\"" == ""\\""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"/\"" for: ""/"" == ""/""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\b\"" for: ""\b"" == ""\b""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\f\"" for: ""\f"" == ""\f""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\n\"" for: ""\n"" == ""\n""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\r\"" for: ""\r"" == ""\r""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\t\"" for: ""\t"" == ""\t""
+# JsonWriter escapes charaters in strings properly
+ok {test-number} - sstream.str() == "\"\\\\/\\t\\r\\n\"" for: ""\\/\t\r\n"" == ""\\/\t\r\n""
 # Lambdas in assertions
 ok {test-number} - []() { return true; }() for: true
 # Less-than inequalities with different epsilons
@@ -2453,6 +2501,18 @@ ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && Cont
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
 # Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tags": [       {         "aliases": [           "fakeTag"         ],         "count": 1       }     ]" contains: "fakeTag" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring("fake reporter"s) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "reporters": [       {         "name": "fake reporter",         "description": "fake description"       }     ]" contains: "fake reporter" with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
+ok {test-number} - listingString, ContainsSubstring( "fake test name"s ) && ContainsSubstring( "fakeTestTag"s ) for: "{   "version": 1,   "metadata": {     "name": "",     "rng-seed": 1234,     "catch2-version": "<version>"   },   "listings": {     "tests": [       {         "name": "fake test name",         "class-name": "",         "tags": [           "fakeTestTag"         ],         "source-location": {           "filename": "fake-file.cpp",           "line": 123456789         }       }     ]" ( contains: "fake test name" and contains: "fakeTestTag" ) with 1 message: 'Tested reporter: JSON'
+# Reporter's write listings to provided stream
+ok {test-number} - !(factories.empty()) for: !false
+# Reporter's write listings to provided stream
 ok {test-number} - listingString, ContainsSubstring("fakeTag"s) for: "<?xml version="1.0" encoding="UTF-8"?> All available tags:    1  [fakeTag] 1 tag  " contains: "fakeTag" with 1 message: 'Tested reporter: JUnit'
 # Reporter's write listings to provided stream
 ok {test-number} - !(factories.empty()) for: !false
@@ -3060,6 +3120,14 @@ not ok {test-number} - explicitly
 ok {test-number} - false  # TODO
 # Testing checked-if 3
 not ok {test-number} - explicitly
+# Testing checked-if 4
+ok {test-number} - true
+# Testing checked-if 4
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
+# Testing checked-if 5
+ok {test-number} - false  # TODO
+# Testing checked-if 5
+not ok {test-number} - unexpected exception with message: 'Uncaught exception should fail!'; expression was: {Unknown expression after the reported line}
 # The NO_FAIL macro reports a failure but does not fail the test
 ok {test-number} - 1 == 2  # TODO
 # The default listing implementation write to provided stream
@@ -4344,6 +4412,10 @@ ok {test-number} - e.upper_bound == 23 for: 23.0 == 23
 ok {test-number} - e.lower_bound == 23 for: 23.0 == 23
 # uniform samples
 ok {test-number} - e.confidence_interval == 0.95 for: 0.95 == 0.95
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.a() == -10 for: -10 == -10
+# uniform_integer_distribution can return the bounds
+ok {test-number} - dist.b() == 10 for: 10 == 10
 # unique_ptr reimplementation: basic functionality
 ok {test-number} - !(ptr) for: !{?}
 # unique_ptr reimplementation: basic functionality
@@ -4466,5 +4538,5 @@ ok {test-number} - q3 == 23. for: 23.0 == 23.0
 ok {test-number} -
 # xmlentitycheck
 ok {test-number} -
-1..2237
+1..2272
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
index 8a8b55e2..2a2c40cf 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.approved.txt
@@ -299,10 +299,11 @@
 ##teamcity[testFinished name='Default scale is invisible to comparison' duration="{duration}"]
 ##teamcity[testStarted name='Directly creating an EnumInfo']
 ##teamcity[testFinished name='Directly creating an EnumInfo' duration="{duration}"]
+##teamcity[testStarted name='Empty generators can SKIP in constructor']
+##teamcity[testIgnored name='Empty generators can SKIP in constructor' message='Skip.tests.cpp:<line number>|n...............................................................................|n|nSkip.tests.cpp:<line number>|nexplicit skip with message:|n  "This generator is empty"']
+##teamcity[testFinished name='Empty generators can SKIP in constructor' duration="{duration}"]
 ##teamcity[testStarted name='Empty stream name opens cout stream']
 ##teamcity[testFinished name='Empty stream name opens cout stream' duration="{duration}"]
-##teamcity[testStarted name='Empty tag is not allowed']
-##teamcity[testFinished name='Empty tag is not allowed' duration="{duration}"]
 ##teamcity[testStarted name='EndsWith string matcher']
 ##teamcity[testFailed name='EndsWith string matcher' message='Matchers.tests.cpp:<line number>|n...............................................................................|n|nMatchers.tests.cpp:<line number>|nexpression failed|n  CHECK_THAT( testStringForMatching(), EndsWith( "Substring" ) )|nwith expansion:|n  "this string contains |'abc|' as a substring" ends with: "Substring"|n']
 ##teamcity[testFailed name='EndsWith string matcher' message='Matchers.tests.cpp:<line number>|nexpression failed|n  CHECK_THAT( testStringForMatching(), EndsWith( "this", Catch::CaseSensitive::No ) )|nwith expansion:|n  "this string contains |'abc|' as a substring" ends with: "this" (case insensitive)|n']
@@ -376,6 +377,8 @@
 ##teamcity[testFinished name='Floating point matchers: double' duration="{duration}"]
 ##teamcity[testStarted name='Floating point matchers: float']
 ##teamcity[testFinished name='Floating point matchers: float' duration="{duration}"]
+##teamcity[testStarted name='GENERATE can combine literals and generators']
+##teamcity[testFinished name='GENERATE can combine literals and generators' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- adapters']
 ##teamcity[testFinished name='Generators -- adapters' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- simple']
@@ -392,6 +395,9 @@
 ##teamcity[testFinished name='Hashing different test cases produces different result' duration="{duration}"]
 ##teamcity[testStarted name='Hashing test case produces same hash across multiple calls']
 ##teamcity[testFinished name='Hashing test case produces same hash across multiple calls' duration="{duration}"]
+##teamcity[testStarted name='INFO and UNSCOPED_INFO can stream multiple arguments']
+##teamcity[testFailed name='INFO and UNSCOPED_INFO can stream multiple arguments' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexplicit failure with messages:|n  "This info has multiple parts."|n  "This unscoped info has multiple parts."|n  "Show infos!"']
+##teamcity[testFinished name='INFO and UNSCOPED_INFO can stream multiple arguments' duration="{duration}"]
 ##teamcity[testStarted name='INFO and WARN do not abort tests']
 ##teamcity[testFinished name='INFO and WARN do not abort tests' duration="{duration}"]
 ##teamcity[testStarted name='INFO gets logged on failure']
@@ -404,6 +410,9 @@
 ##teamcity[testStarted name='INFO is reset for each loop']
 ##teamcity[testFailed name='INFO is reset for each loop' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexpression failed with messages:|n  "current counter 10"|n  "i := 10"|n  REQUIRE( i < 10 )|nwith expansion:|n  10 < 10|n']
 ##teamcity[testFinished name='INFO is reset for each loop' duration="{duration}"]
+##teamcity[testStarted name='Incomplete AssertionHandler']
+##teamcity[testIgnored name='Incomplete AssertionHandler' message='AssertionHandler.tests.cpp:<line number>|n...............................................................................|n|nAssertionHandler.tests.cpp:<line number>|nunexpected exception with message:|n  "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"|n  REQUIRE( Dummy )|nwith expansion:|n  Dummy|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Incomplete AssertionHandler' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should fail']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|n...............................................................................|n|nCondition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.int_seven != 7 )|nwith expansion:|n  7 != 7|n- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.float_nine_point_one != Approx( 9.1f ) )|nwith expansion:|n  9.1f != Approx( 9.1000003815 )|n- failure ignore as test marked as |'ok to fail|'|n']
@@ -413,6 +422,10 @@
 ##teamcity[testFinished name='Inequality checks that should fail' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should succeed']
 ##teamcity[testFinished name='Inequality checks that should succeed' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter']
+##teamcity[testFinished name='JsonWriter' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter escapes charaters in strings properly']
+##teamcity[testFinished name='JsonWriter escapes charaters in strings properly' duration="{duration}"]
 ##teamcity[testStarted name='Lambdas in assertions']
 ##teamcity[testFinished name='Lambdas in assertions' duration="{duration}"]
 ##teamcity[testStarted name='Less-than inequalities with different epsilons']
@@ -638,6 +651,12 @@
 ##teamcity[testStarted name='Testing checked-if 3']
 ##teamcity[testIgnored name='Testing checked-if 3' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nexplicit failure- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testFinished name='Testing checked-if 3' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 4']
+##teamcity[testIgnored name='Testing checked-if 4' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 4' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 5']
+##teamcity[testIgnored name='Testing checked-if 5' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 5' duration="{duration}"]
 ##teamcity[testStarted name='The NO_FAIL macro reports a failure but does not fail the test']
 ##teamcity[testFinished name='The NO_FAIL macro reports a failure but does not fail the test' duration="{duration}"]
 ##teamcity[testStarted name='The default listing implementation write to provided stream']
@@ -975,6 +994,8 @@ loose text artifact
 ##teamcity[testFinished name='tuple<tuple<int>,tuple<>,float>' duration="{duration}"]
 ##teamcity[testStarted name='uniform samples']
 ##teamcity[testFinished name='uniform samples' duration="{duration}"]
+##teamcity[testStarted name='uniform_integer_distribution can return the bounds']
+##teamcity[testFinished name='uniform_integer_distribution can return the bounds' duration="{duration}"]
 ##teamcity[testStarted name='unique_ptr reimplementation: basic functionality']
 ##teamcity[testFinished name='unique_ptr reimplementation: basic functionality' duration="{duration}"]
 ##teamcity[testStarted name='vec<vec<string,alloc>> -> toString']
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
index 77f70a63..24ed5d98 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/teamcity.sw.multi.approved.txt
@@ -299,10 +299,11 @@
 ##teamcity[testFinished name='Default scale is invisible to comparison' duration="{duration}"]
 ##teamcity[testStarted name='Directly creating an EnumInfo']
 ##teamcity[testFinished name='Directly creating an EnumInfo' duration="{duration}"]
+##teamcity[testStarted name='Empty generators can SKIP in constructor']
+##teamcity[testIgnored name='Empty generators can SKIP in constructor' message='Skip.tests.cpp:<line number>|n...............................................................................|n|nSkip.tests.cpp:<line number>|nexplicit skip with message:|n  "This generator is empty"']
+##teamcity[testFinished name='Empty generators can SKIP in constructor' duration="{duration}"]
 ##teamcity[testStarted name='Empty stream name opens cout stream']
 ##teamcity[testFinished name='Empty stream name opens cout stream' duration="{duration}"]
-##teamcity[testStarted name='Empty tag is not allowed']
-##teamcity[testFinished name='Empty tag is not allowed' duration="{duration}"]
 ##teamcity[testStarted name='EndsWith string matcher']
 ##teamcity[testFailed name='EndsWith string matcher' message='Matchers.tests.cpp:<line number>|n...............................................................................|n|nMatchers.tests.cpp:<line number>|nexpression failed|n  CHECK_THAT( testStringForMatching(), EndsWith( "Substring" ) )|nwith expansion:|n  "this string contains |'abc|' as a substring" ends with: "Substring"|n']
 ##teamcity[testFailed name='EndsWith string matcher' message='Matchers.tests.cpp:<line number>|nexpression failed|n  CHECK_THAT( testStringForMatching(), EndsWith( "this", Catch::CaseSensitive::No ) )|nwith expansion:|n  "this string contains |'abc|' as a substring" ends with: "this" (case insensitive)|n']
@@ -376,6 +377,8 @@
 ##teamcity[testFinished name='Floating point matchers: double' duration="{duration}"]
 ##teamcity[testStarted name='Floating point matchers: float']
 ##teamcity[testFinished name='Floating point matchers: float' duration="{duration}"]
+##teamcity[testStarted name='GENERATE can combine literals and generators']
+##teamcity[testFinished name='GENERATE can combine literals and generators' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- adapters']
 ##teamcity[testFinished name='Generators -- adapters' duration="{duration}"]
 ##teamcity[testStarted name='Generators -- simple']
@@ -392,6 +395,9 @@
 ##teamcity[testFinished name='Hashing different test cases produces different result' duration="{duration}"]
 ##teamcity[testStarted name='Hashing test case produces same hash across multiple calls']
 ##teamcity[testFinished name='Hashing test case produces same hash across multiple calls' duration="{duration}"]
+##teamcity[testStarted name='INFO and UNSCOPED_INFO can stream multiple arguments']
+##teamcity[testFailed name='INFO and UNSCOPED_INFO can stream multiple arguments' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexplicit failure with messages:|n  "This info has multiple parts."|n  "This unscoped info has multiple parts."|n  "Show infos!"']
+##teamcity[testFinished name='INFO and UNSCOPED_INFO can stream multiple arguments' duration="{duration}"]
 ##teamcity[testStarted name='INFO and WARN do not abort tests']
 ##teamcity[testFinished name='INFO and WARN do not abort tests' duration="{duration}"]
 ##teamcity[testStarted name='INFO gets logged on failure']
@@ -404,6 +410,9 @@
 ##teamcity[testStarted name='INFO is reset for each loop']
 ##teamcity[testFailed name='INFO is reset for each loop' message='Message.tests.cpp:<line number>|n...............................................................................|n|nMessage.tests.cpp:<line number>|nexpression failed with messages:|n  "current counter 10"|n  "i := 10"|n  REQUIRE( i < 10 )|nwith expansion:|n  10 < 10|n']
 ##teamcity[testFinished name='INFO is reset for each loop' duration="{duration}"]
+##teamcity[testStarted name='Incomplete AssertionHandler']
+##teamcity[testIgnored name='Incomplete AssertionHandler' message='AssertionHandler.tests.cpp:<line number>|n...............................................................................|n|nAssertionHandler.tests.cpp:<line number>|nunexpected exception with message:|n  "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE"|n  REQUIRE( Dummy )|nwith expansion:|n  Dummy|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Incomplete AssertionHandler' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should fail']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|n...............................................................................|n|nCondition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.int_seven != 7 )|nwith expansion:|n  7 != 7|n- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testIgnored name='Inequality checks that should fail' message='Condition.tests.cpp:<line number>|nexpression failed|n  CHECK( data.float_nine_point_one != Approx( 9.1f ) )|nwith expansion:|n  9.1f != Approx( 9.1000003815 )|n- failure ignore as test marked as |'ok to fail|'|n']
@@ -413,6 +422,10 @@
 ##teamcity[testFinished name='Inequality checks that should fail' duration="{duration}"]
 ##teamcity[testStarted name='Inequality checks that should succeed']
 ##teamcity[testFinished name='Inequality checks that should succeed' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter']
+##teamcity[testFinished name='JsonWriter' duration="{duration}"]
+##teamcity[testStarted name='JsonWriter escapes charaters in strings properly']
+##teamcity[testFinished name='JsonWriter escapes charaters in strings properly' duration="{duration}"]
 ##teamcity[testStarted name='Lambdas in assertions']
 ##teamcity[testFinished name='Lambdas in assertions' duration="{duration}"]
 ##teamcity[testStarted name='Less-than inequalities with different epsilons']
@@ -638,6 +651,12 @@
 ##teamcity[testStarted name='Testing checked-if 3']
 ##teamcity[testIgnored name='Testing checked-if 3' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nexplicit failure- failure ignore as test marked as |'ok to fail|'|n']
 ##teamcity[testFinished name='Testing checked-if 3' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 4']
+##teamcity[testIgnored name='Testing checked-if 4' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 4' duration="{duration}"]
+##teamcity[testStarted name='Testing checked-if 5']
+##teamcity[testIgnored name='Testing checked-if 5' message='Misc.tests.cpp:<line number>|n...............................................................................|n|nMisc.tests.cpp:<line number>|nunexpected exception with message:|n  "Uncaught exception should fail!"|n  {Unknown expression after the reported line}|nwith expansion:|n  {Unknown expression after the reported line}|n- failure ignore as test marked as |'ok to fail|'|n']
+##teamcity[testFinished name='Testing checked-if 5' duration="{duration}"]
 ##teamcity[testStarted name='The NO_FAIL macro reports a failure but does not fail the test']
 ##teamcity[testFinished name='The NO_FAIL macro reports a failure but does not fail the test' duration="{duration}"]
 ##teamcity[testStarted name='The default listing implementation write to provided stream']
@@ -974,6 +993,8 @@
 ##teamcity[testFinished name='tuple<tuple<int>,tuple<>,float>' duration="{duration}"]
 ##teamcity[testStarted name='uniform samples']
 ##teamcity[testFinished name='uniform samples' duration="{duration}"]
+##teamcity[testStarted name='uniform_integer_distribution can return the bounds']
+##teamcity[testFinished name='uniform_integer_distribution can return the bounds' duration="{duration}"]
 ##teamcity[testStarted name='unique_ptr reimplementation: basic functionality']
 ##teamcity[testFinished name='unique_ptr reimplementation: basic functionality' duration="{duration}"]
 ##teamcity[testStarted name='vec<vec<string,alloc>> -> toString']
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
index 6a0d1587..be57798b 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.approved.txt
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<Catch2TestRun name="<exe-name>" rng-seed="1" xml-format-version="2" catch2-version="<version>" filters="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]">
+<Catch2TestRun name="<exe-name>" rng-seed="1" xml-format-version="3" catch2-version="<version>" filters="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]">
   <TestCase name="# A test name that starts with a #" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
     <OverallResult success="true" skips="0"/>
   </TestCase>
@@ -77,10 +77,10 @@
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#1238" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       uarr := "123"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       sarr := "456"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -91,10 +91,10 @@
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       uarr := "123"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       sarr := "456"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -128,11 +128,11 @@
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#1455 - INFO and WARN can start with a linebreak" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
 
 This info message starts with a linebreak
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
 
 This warning message starts with a linebreak
     </Warning>
@@ -384,91 +384,91 @@ Nor would this
     <Section name="A" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="A" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <OverallResult success="true" skips="0"/>
@@ -667,7 +667,7 @@ Nor would this
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail][generators][regression]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Exception filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       failure to init
     </Exception>
@@ -675,7 +675,7 @@ Nor would this
   </TestCase>
   <TestCase name="#748 - captures with unexpected exceptions" tags="[!shouldfail][!throws][.][failing]" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
     <Section name="outside assertions" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Exception filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -684,7 +684,7 @@ Nor would this
       <OverallResults successes="0" failures="0" expectedFailures="1" skipped="false"/>
     </Section>
     <Section name="inside REQUIRE_NOTHROW" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Expression success="false" type="REQUIRE_NOTHROW" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -701,7 +701,7 @@ Nor would this
       <OverallResults successes="0" failures="0" expectedFailures="1" skipped="false"/>
     </Section>
     <Section name="inside REQUIRE_THROWS" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Expression success="true" type="REQUIRE_THROWS" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -806,7 +806,7 @@ Nor would this
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#872" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       dummy := 0
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -2886,92 +2886,92 @@ Nor would this
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       b := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       c := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a + b := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a+b := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       c > b := true
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a == 1 := true
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions involving commas" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[0, 1, 2] := 3
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[0, 1, 2] := 0
     </Info>
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[(0, 1)] := 2
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[(0, 1)] := 0
     </Info>
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[0] := 1
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[0] := 0
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>{12, -12}) := { 12, -12 }
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>(-12, 12)) := { -12, 12 }
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (1, 2) := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (2, 3) := 3
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE parses string and character constants" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ("comma, in string", "escaped, \", ") := "escaped, ", "
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "single quote in string,'," := "single quote in string,',"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "some escapes, \\,\\\\" := "some escapes, \,\\"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "some, ), unmatched, } prenheses {[&lt;" := "some, ), unmatched, } prenheses {[&lt;"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '"' := '"'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '\'' := '''
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ',' := ','
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '}' := '}'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ')' := ')'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '(' := '('
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '{' := '{'
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Capture and info messages" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
     <Section name="Capture should stringify like assertions" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
         i := 2
       </Info>
       <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
@@ -2985,7 +2985,7 @@ Nor would this
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Info should NOT stringify the way assertions do" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
         3
       </Info>
       <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
@@ -4364,6 +4364,12 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Empty generators can SKIP in constructor" tags="[skipping]" filename="tests/<exe-name>/UsageTests/Skip.tests.cpp" >
+    <Skip filename="tests/<exe-name>/UsageTests/Skip.tests.cpp" >
+      This generator is empty
+    </Skip>
+    <OverallResult success="true" skips="1"/>
+  </TestCase>
   <TestCase name="Empty stream name opens cout stream" tags="[streams]" filename="tests/<exe-name>/IntrospectiveTests/Stream.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Stream.tests.cpp" >
       <Original>
@@ -4375,17 +4381,6 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="Empty tag is not allowed" filename="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp" >
-    <Expression success="true" type="REQUIRE_THROWS" filename="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp" >
-      <Original>
-        Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
-      </Original>
-      <Expanded>
-        Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
-      </Expanded>
-    </Expression>
-    <OverallResult success="true" skips="0"/>
-  </TestCase>
   <TestCase name="EndsWith string matcher" tags="[.][failing][matchers]" filename="tests/<exe-name>/UsageTests/Matchers.tests.cpp" >
     <Expression success="false" type="CHECK_THAT" filename="tests/<exe-name>/UsageTests/Matchers.tests.cpp" >
       <Original>
@@ -4977,7 +4972,7 @@ C
     <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       This is a failure
     </Failure>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       This message appears in the output
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -5588,6 +5583,41 @@ C
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="GENERATE can combine literals and generators" tags="[generators]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Generators -- adapters" tags="[generators][generic]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Section name="Filtering by predicate" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
@@ -7246,7 +7276,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Exact" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7257,7 +7287,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7268,7 +7298,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7279,7 +7309,7 @@ C
                 -0.9 == Approx( -0.9 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7290,7 +7320,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7301,7 +7331,7 @@ C
                 -0.8 == Approx( -0.8 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7312,7 +7342,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7323,7 +7353,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7334,7 +7364,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7345,7 +7375,7 @@ C
                 -0.6 == Approx( -0.6 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7356,7 +7386,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7367,7 +7397,7 @@ C
                 -0.5 == Approx( -0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7378,7 +7408,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7389,7 +7419,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7400,7 +7430,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7411,7 +7441,7 @@ C
                 -0.3 == Approx( -0.3 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7422,7 +7452,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7433,7 +7463,7 @@ C
                 -0.2 == Approx( -0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7444,7 +7474,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7455,7 +7485,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7466,7 +7496,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1.38778e-16
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7477,7 +7507,7 @@ C
                 -0.0 == Approx( -0.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1.38778e-16
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7488,7 +7518,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7499,7 +7529,7 @@ C
                 0.1 == Approx( 0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7510,7 +7540,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7521,7 +7551,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7532,7 +7562,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7543,7 +7573,7 @@ C
                 0.3 == Approx( 0.3 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7554,7 +7584,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7565,7 +7595,7 @@ C
                 0.4 == Approx( 0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7576,7 +7606,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7587,7 +7617,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7598,7 +7628,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7609,7 +7639,7 @@ C
                 0.6 == Approx( 0.6 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7620,7 +7650,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7631,7 +7661,7 @@ C
                 0.7 == Approx( 0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7642,7 +7672,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7653,7 +7683,7 @@ C
                 0.8 == Approx( 0.8 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7664,7 +7694,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7675,7 +7705,7 @@ C
                 0.9 == Approx( 0.9 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7714,7 +7744,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Slightly over end" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7725,7 +7755,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7736,7 +7766,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7747,7 +7777,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7758,7 +7788,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7769,7 +7799,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7780,7 +7810,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7791,7 +7821,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7802,7 +7832,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7813,7 +7843,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7824,7 +7854,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7835,7 +7865,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7866,7 +7896,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Slightly under end" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7877,7 +7907,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7888,7 +7918,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7899,7 +7929,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7910,7 +7940,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7921,7 +7951,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7932,7 +7962,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7943,7 +7973,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7954,7 +7984,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7965,7 +7995,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7976,7 +8006,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7987,7 +8017,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -8376,20 +8406,32 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="INFO and UNSCOPED_INFO can stream multiple arguments" tags="[.][failing][info][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This info has multiple parts.
+    </Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This unscoped info has multiple parts.
+    </Info>
+    <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      Show infos!
+    </Failure>
+    <OverallResult success="false" skips="0"/>
+  </TestCase>
   <TestCase name="INFO and WARN do not abort tests" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a message
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a warning
     </Warning>
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO gets logged on failure" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       so should this
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8403,7 +8445,7 @@ C
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO gets logged on failure, even if captured before successful assertions" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8414,10 +8456,10 @@ C
         2 == 2
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8428,13 +8470,13 @@ C
         2 == 1
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and this, but later
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8445,16 +8487,16 @@ C
         2 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and this, but later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       but not this
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8468,10 +8510,10 @@ C
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO is reset for each loop" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 0
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 0
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8482,10 +8524,10 @@ C
         0 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 1
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8496,10 +8538,10 @@ C
         1 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 2
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8510,10 +8552,10 @@ C
         2 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 3
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8524,10 +8566,10 @@ C
         3 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 4
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8538,10 +8580,10 @@ C
         4 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 5
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8552,10 +8594,10 @@ C
         5 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 6
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 6
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8566,10 +8608,10 @@ C
         6 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 7
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 7
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8580,10 +8622,10 @@ C
         7 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 8
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 8
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8594,10 +8636,10 @@ C
         8 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 9
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 9
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8608,10 +8650,10 @@ C
         9 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 10
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 10
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8624,6 +8666,20 @@ C
     </Expression>
     <OverallResult success="false" skips="0"/>
   </TestCase>
+  <TestCase name="Incomplete AssertionHandler" tags="[!shouldfail][assertion-handler]" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+    <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+      <Original>
+        Dummy
+      </Original>
+      <Expanded>
+        Dummy
+      </Expanded>
+      <Exception filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+        Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Inequality checks that should fail" tags="[!shouldfail][.][failing]" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
       <Original>
@@ -8758,6 +8814,277 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="JsonWriter" tags="[JSON][JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Newly constructed JsonWriter does nothing" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == ""
+        </Original>
+        <Expanded>
+          "" == ""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject with key will create an object to write the value" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"int\": 1," ) &amp;&amp; ContainsSubstring( "\"double\": 1.5," ) &amp;&amp; ContainsSubstring( "\"true\": true," ) &amp;&amp; ContainsSubstring( "\"false\": false," ) &amp;&amp; ContainsSubstring( "\"string\": \"this is a string\"," ) &amp;&amp; ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" )
+        </Original>
+        <Expanded>
+          "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="nesting objects" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) &amp;&amp; ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" )
+        </Original>
+        <Expanded>
+          "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray creates array to write the values to" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]"
+        </Original>
+        <Expanded>
+          "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonObjectWriter shall not insert superfluous brace" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonArrayWriter shall not insert superfluous bracket" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Custom class shall be quoted" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "\"custom\""
+        </Original>
+        <Expanded>
+          ""custom"" == ""custom""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="JsonWriter escapes charaters in strings properly" tags="[JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Quote in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\"\""
+        </Original>
+        <Expanded>
+          ""\""" == ""\"""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backslash in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\\""
+        </Original>
+        <Expanded>
+          ""\\"" == ""\\""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Forward slash in a string is **not** escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"/\""
+        </Original>
+        <Expanded>
+          ""/"" == ""/""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backspace in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\b\""
+        </Original>
+        <Expanded>
+          ""\b"" == ""\b""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Formfeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\f\""
+        </Original>
+        <Expanded>
+          ""\f"" == ""\f""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="linefeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\n\""
+        </Original>
+        <Expanded>
+          ""\n"" == ""\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="carriage return in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\r\""
+        </Original>
+        <Expanded>
+          ""\r"" == ""\r""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="tab in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\t\""
+        </Original>
+        <Expanded>
+          ""\t"" == ""\t""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="combination of characters is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\/\\t\\r\\n\""
+        </Original>
+        <Expanded>
+          ""\\/\t\r\n"" == ""\\/\t\r\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Lambdas in assertions" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       <Original>
@@ -9219,7 +9546,7 @@ C
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Nice descriptive name" tags="[.][tag1][tag2][tag3]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       This one ran
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -10005,7 +10332,7 @@ C
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Parsing tags with non-alphabetical characters is pass-through" tags="[test-spec][test-spec-parser]" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10016,7 +10343,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10027,7 +10354,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10038,7 +10365,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10049,7 +10376,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10060,7 +10387,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10471,7 +10798,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="-r/console" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10482,7 +10809,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10499,7 +10826,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="-r/xml" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10510,7 +10837,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10527,7 +10854,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="--reporter/junit" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10538,7 +10865,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10577,7 +10904,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="With output file" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10588,7 +10915,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10605,7 +10932,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="With Windows-like absolute path as output file" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10616,7 +10943,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -11441,7 +11768,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11467,7 +11794,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11492,7 +11819,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11519,7 +11846,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11545,7 +11872,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11570,7 +11897,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11597,7 +11924,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11623,7 +11950,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11648,7 +11975,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11674,8 +12001,122 @@ C
         !false
       </Expanded>
     </Expression>
+    <Section name="JSON reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fakeTag"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fake reporter"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring( "fake test name"s ) &amp;&amp; ContainsSubstring( "fakeTestTag"s )
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
     <Section name="JUnit reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11702,7 +12143,7 @@ All available tags:
       </Expanded>
     </Expression>
     <Section name="JUnit reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11728,7 +12169,7 @@ Available reporters:
       </Expanded>
     </Expression>
     <Section name="JUnit reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11756,7 +12197,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11783,7 +12224,7 @@ All available tags:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11809,7 +12250,7 @@ Available reporters:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11837,7 +12278,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11863,7 +12304,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11888,7 +12329,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11915,7 +12356,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11941,7 +12382,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11966,7 +12407,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11993,7 +12434,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -12023,7 +12464,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -12051,7 +12492,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -14552,6 +14993,50 @@ Message from section two
     <Failure filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" />
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Testing checked-if 4" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="true" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        true
+      </Original>
+      <Expanded>
+        true
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="Testing checked-if 5" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="false" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        false
+      </Original>
+      <Expanded>
+        false
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="The NO_FAIL macro reports a failure but does not fail the test" tags="[messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Expression success="false" type="CHECK_NOFAIL" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       <Original>
@@ -15670,7 +16155,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One false evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One false evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !AllTrue()
@@ -15712,7 +16197,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One false evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One false evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !AllTrue()
@@ -16020,7 +16505,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, AnyTrue()
@@ -16062,7 +16547,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, AnyTrue()
@@ -16370,7 +16855,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !NoneTrue()
@@ -16412,7 +16897,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !NoneTrue()
@@ -18766,7 +19251,7 @@ loose text artifact
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="looped tests" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[0] (1) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18777,7 +19262,7 @@ loose text artifact
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[1] (1) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18788,7 +19273,7 @@ loose text artifact
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[2] (2) is even
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18799,7 +19284,7 @@ loose text artifact
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[3] (3) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18810,7 +19295,7 @@ loose text artifact
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[4] (5) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18821,7 +19306,7 @@ loose text artifact
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[5] (8) is even
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18832,7 +19317,7 @@ loose text artifact
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[6] (13) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18843,7 +19328,7 @@ loose text artifact
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[7] (21) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18982,22 +19467,22 @@ loose text artifact
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="mix info, unscoped info and warning" tags="[info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       info
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       unscoped info
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and warn may mix
     </Warning>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       info
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       unscoped info
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       they are not cleared after warnings
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -19204,7 +19689,7 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="not prints unscoped info from previous failures" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen only for the FIRST assertion IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19215,7 +19700,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen only for the SECOND assertion IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19226,7 +19711,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19362,7 +19847,7 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="print unscoped info if passing unscoped info is printed" tags="[info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19376,10 +19861,10 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="prints unscoped info on failure" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD also be seen
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19393,7 +19878,7 @@ b1!
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="prints unscoped info only for the first assertion" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen only ONCE
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19412,7 +19897,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY also be seen only ONCE IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19895,7 +20380,7 @@ b1!
     <OverallResult success="true" skips="1"/>
   </TestCase>
   <TestCase name="send a single char to INFO" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       3
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -19909,10 +20394,10 @@ b1!
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="sends information to INFO" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       hi
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 7
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19970,16 +20455,16 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="stacks unscoped info in loops" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       Count 1 to 3...
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       3
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19990,16 +20475,16 @@ b1!
         false
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       Count 4 to 6...
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       6
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -20649,6 +21134,25 @@ b1!
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="uniform_integer_distribution can return the bounds" tags="[distribution][rng]" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.a() == -10
+      </Original>
+      <Expanded>
+        -10 == -10
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.b() == 10
+      </Original>
+      <Expanded>
+        10 == 10
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="unique_ptr reimplementation: basic functionality" tags="[internals][unique-ptr]" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
     <Section name="Default constructed unique_ptr is empty" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
       <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
@@ -21203,6 +21707,6 @@ b1!
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <OverallResults successes="2049" failures="145" expectedFailures="32" skips="11"/>
-  <OverallResultsCases successes="309" failures="84" expectedFailures="11" skips="5"/>
+  <OverallResults successes="2079" failures="146" expectedFailures="35" skips="12"/>
+  <OverallResultsCases successes="312" failures="85" expectedFailures="14" skips="6"/>
 </Catch2TestRun>
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
index c6ddfc80..08ff6c43 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/Baselines/xml.sw.multi.approved.txt
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<Catch2TestRun name="<exe-name>" rng-seed="1" xml-format-version="2" catch2-version="<version>" filters="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]">
+<Catch2TestRun name="<exe-name>" rng-seed="1" xml-format-version="3" catch2-version="<version>" filters="&quot;*&quot; ~[!nonportable] ~[!benchmark] ~[approvals]">
   <TestCase name="# A test name that starts with a #" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
     <OverallResult success="true" skips="0"/>
   </TestCase>
@@ -77,10 +77,10 @@
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#1238" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       uarr := "123"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       sarr := "456"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -91,10 +91,10 @@
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       uarr := "123"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       sarr := "456"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -128,11 +128,11 @@
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#1455 - INFO and WARN can start with a linebreak" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
 
 This info message starts with a linebreak
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
 
 This warning message starts with a linebreak
     </Warning>
@@ -384,91 +384,91 @@ Nor would this
     <Section name="A" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="A" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <Section name="B" filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       i := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       j := 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/PartTracker.tests.cpp" >
       k := 6
     </Info>
     <OverallResult success="true" skips="0"/>
@@ -667,7 +667,7 @@ Nor would this
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+  <TestCase name="#2615 - Throwing in constructor generator fails test case but does not abort" tags="[!shouldfail][generators][regression]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Exception filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       failure to init
     </Exception>
@@ -675,7 +675,7 @@ Nor would this
   </TestCase>
   <TestCase name="#748 - captures with unexpected exceptions" tags="[!shouldfail][!throws][.][failing]" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
     <Section name="outside assertions" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Exception filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -684,7 +684,7 @@ Nor would this
       <OverallResults successes="0" failures="0" expectedFailures="1" skipped="false"/>
     </Section>
     <Section name="inside REQUIRE_NOTHROW" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Expression success="false" type="REQUIRE_NOTHROW" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -701,7 +701,7 @@ Nor would this
       <OverallResults successes="0" failures="0" expectedFailures="1" skipped="false"/>
     </Section>
     <Section name="inside REQUIRE_THROWS" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
         answer := 42
       </Info>
       <Expression success="true" type="REQUIRE_THROWS" filename="tests/<exe-name>/UsageTests/Exception.tests.cpp" >
@@ -806,7 +806,7 @@ Nor would this
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="#872" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       dummy := 0
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
@@ -2886,92 +2886,92 @@ Nor would this
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a := 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       b := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       c := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a + b := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a+b := 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       c > b := true
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       a == 1 := true
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE can deal with complex expressions involving commas" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[0, 1, 2] := 3
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[0, 1, 2] := 0
     </Info>
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[(0, 1)] := 2
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[(0, 1)] := 0
     </Info>
-    <Info>
-      std::vector&lt;int>{1, 2, 3}[0] := 1
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      custom_index_op&lt;int>{1, 2, 3}[0] := 0
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>{12, -12}) := { 12, -12 }
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (helper_1436&lt;int, int>(-12, 12)) := { -12, 12 }
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (1, 2) := 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       (2, 3) := 3
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="CAPTURE parses string and character constants" tags="[capture][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ("comma, in string", "escaped, \", ") := "escaped, ", "
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "single quote in string,'," := "single quote in string,',"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "some escapes, \\,\\\\" := "some escapes, \,\\"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       "some, ), unmatched, } prenheses {[&lt;" := "some, ), unmatched, } prenheses {[&lt;"
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '"' := '"'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '\'' := '''
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ',' := ','
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '}' := '}'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       ')' := ')'
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '(' := '('
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       '{' := '{'
     </Info>
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Capture and info messages" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
     <Section name="Capture should stringify like assertions" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
         i := 2
       </Info>
       <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
@@ -2985,7 +2985,7 @@ Nor would this
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Info should NOT stringify the way assertions do" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
         3
       </Info>
       <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/ToStringGeneral.tests.cpp" >
@@ -4364,6 +4364,12 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Empty generators can SKIP in constructor" tags="[skipping]" filename="tests/<exe-name>/UsageTests/Skip.tests.cpp" >
+    <Skip filename="tests/<exe-name>/UsageTests/Skip.tests.cpp" >
+      This generator is empty
+    </Skip>
+    <OverallResult success="true" skips="1"/>
+  </TestCase>
   <TestCase name="Empty stream name opens cout stream" tags="[streams]" filename="tests/<exe-name>/IntrospectiveTests/Stream.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Stream.tests.cpp" >
       <Original>
@@ -4375,17 +4381,6 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <TestCase name="Empty tag is not allowed" filename="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp" >
-    <Expression success="true" type="REQUIRE_THROWS" filename="tests/<exe-name>/IntrospectiveTests/Tag.tests.cpp" >
-      <Original>
-        Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
-      </Original>
-      <Expanded>
-        Catch::TestCaseInfo( "", { "fake test name", "[]" }, dummySourceLineInfo )
-      </Expanded>
-    </Expression>
-    <OverallResult success="true" skips="0"/>
-  </TestCase>
   <TestCase name="EndsWith string matcher" tags="[.][failing][matchers]" filename="tests/<exe-name>/UsageTests/Matchers.tests.cpp" >
     <Expression success="false" type="CHECK_THAT" filename="tests/<exe-name>/UsageTests/Matchers.tests.cpp" >
       <Original>
@@ -4977,7 +4972,7 @@ C
     <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       This is a failure
     </Failure>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       This message appears in the output
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -5588,6 +5583,41 @@ C
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="GENERATE can combine literals and generators" tags="[generators]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
+      <Original>
+        i % 2 == 0
+      </Original>
+      <Expanded>
+        0 == 0
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Generators -- adapters" tags="[generators][generic]" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
     <Section name="Filtering by predicate" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
       <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/Generators.tests.cpp" >
@@ -7246,7 +7276,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Exact" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7257,7 +7287,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7268,7 +7298,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7279,7 +7309,7 @@ C
                 -0.9 == Approx( -0.9 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7290,7 +7320,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7301,7 +7331,7 @@ C
                 -0.8 == Approx( -0.8 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7312,7 +7342,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7323,7 +7353,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7334,7 +7364,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7345,7 +7375,7 @@ C
                 -0.6 == Approx( -0.6 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7356,7 +7386,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7367,7 +7397,7 @@ C
                 -0.5 == Approx( -0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7378,7 +7408,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7389,7 +7419,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7400,7 +7430,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7411,7 +7441,7 @@ C
                 -0.3 == Approx( -0.3 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7422,7 +7452,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7433,7 +7463,7 @@ C
                 -0.2 == Approx( -0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7444,7 +7474,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7455,7 +7485,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7466,7 +7496,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1.38778e-16
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7477,7 +7507,7 @@ C
                 -0.0 == Approx( -0.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1.38778e-16
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7488,7 +7518,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7499,7 +7529,7 @@ C
                 0.1 == Approx( 0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7510,7 +7540,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7521,7 +7551,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7532,7 +7562,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7543,7 +7573,7 @@ C
                 0.3 == Approx( 0.3 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.3
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7554,7 +7584,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7565,7 +7595,7 @@ C
                 0.4 == Approx( 0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7576,7 +7606,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7587,7 +7617,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7598,7 +7628,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7609,7 +7639,7 @@ C
                 0.6 == Approx( 0.6 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.6
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7620,7 +7650,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7631,7 +7661,7 @@ C
                 0.7 == Approx( 0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7642,7 +7672,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7653,7 +7683,7 @@ C
                 0.8 == Approx( 0.8 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.8
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7664,7 +7694,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7675,7 +7705,7 @@ C
                 0.9 == Approx( 0.9 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.9
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7714,7 +7744,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Slightly over end" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7725,7 +7755,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7736,7 +7766,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7747,7 +7777,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7758,7 +7788,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7769,7 +7799,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7780,7 +7810,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7791,7 +7821,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7802,7 +7832,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7813,7 +7843,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7824,7 +7854,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7835,7 +7865,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7866,7 +7896,7 @@ C
       <Section name="Positive manual step" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
         <Section name="Floating Point" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
           <Section name="Slightly under end" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7877,7 +7907,7 @@ C
                 -1.0 == Approx( -1.0 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7888,7 +7918,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7899,7 +7929,7 @@ C
                 -0.7 == Approx( -0.7 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.7
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7910,7 +7940,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7921,7 +7951,7 @@ C
                 -0.4 == Approx( -0.4 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.4
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7932,7 +7962,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7943,7 +7973,7 @@ C
                 -0.1 == Approx( -0.1 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is -0.1
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7954,7 +7984,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7965,7 +7995,7 @@ C
                 0.2 == Approx( 0.2 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.2
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7976,7 +8006,7 @@ C
                 true
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -7987,7 +8017,7 @@ C
                 0.5 == Approx( 0.5 )
               </Expanded>
             </Expression>
-            <Info>
+            <Info filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
               Current expected value is 0.5
             </Info>
             <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/GeneratorsImpl.tests.cpp" >
@@ -8376,20 +8406,32 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="INFO and UNSCOPED_INFO can stream multiple arguments" tags="[.][failing][info][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This info has multiple parts.
+    </Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      This unscoped info has multiple parts.
+    </Info>
+    <Failure filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
+      Show infos!
+    </Failure>
+    <OverallResult success="false" skips="0"/>
+  </TestCase>
   <TestCase name="INFO and WARN do not abort tests" tags="[.][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a message
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this is a warning
     </Warning>
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO gets logged on failure" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       so should this
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8403,7 +8445,7 @@ C
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO gets logged on failure, even if captured before successful assertions" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8414,10 +8456,10 @@ C
         2 == 2
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8428,13 +8470,13 @@ C
         2 == 1
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and this, but later
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8445,16 +8487,16 @@ C
         2 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message may be logged later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this message should be logged
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and this, but later
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       but not this
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8468,10 +8510,10 @@ C
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="INFO is reset for each loop" tags="[.][failing][messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 0
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 0
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8482,10 +8524,10 @@ C
         0 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 1
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8496,10 +8538,10 @@ C
         1 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 2
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8510,10 +8552,10 @@ C
         2 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 3
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 3
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8524,10 +8566,10 @@ C
         3 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 4
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8538,10 +8580,10 @@ C
         4 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 5
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8552,10 +8594,10 @@ C
         5 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 6
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 6
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8566,10 +8608,10 @@ C
         6 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 7
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 7
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8580,10 +8622,10 @@ C
         7 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 8
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 8
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8594,10 +8636,10 @@ C
         8 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 9
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 9
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8608,10 +8650,10 @@ C
         9 &lt; 10
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       current counter 10
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 10
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -8624,6 +8666,20 @@ C
     </Expression>
     <OverallResult success="false" skips="0"/>
   </TestCase>
+  <TestCase name="Incomplete AssertionHandler" tags="[!shouldfail][assertion-handler]" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+    <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+      <Original>
+        Dummy
+      </Original>
+      <Expanded>
+        Dummy
+      </Expanded>
+      <Exception filename="tests/<exe-name>/IntrospectiveTests/AssertionHandler.tests.cpp" >
+        Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Inequality checks that should fail" tags="[!shouldfail][.][failing]" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Condition.tests.cpp" >
       <Original>
@@ -8758,6 +8814,277 @@ C
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="JsonWriter" tags="[JSON][JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Newly constructed JsonWriter does nothing" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == ""
+        </Original>
+        <Expanded>
+          "" == ""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeObject with key will create an object to write the value" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"int\": 1," ) &amp;&amp; ContainsSubstring( "\"double\": 1.5," ) &amp;&amp; ContainsSubstring( "\"true\": true," ) &amp;&amp; ContainsSubstring( "\"false\": false," ) &amp;&amp; ContainsSubstring( "\"string\": \"this is a string\"," ) &amp;&amp; ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" )
+        </Original>
+        <Expanded>
+          "{
+  "int": 1,
+  "double": 1.5,
+  "true": true,
+  "false": false,
+  "string": "this is a string",
+  "array": [
+    1,
+    2
+  ]
+}" ( contains: ""int": 1," and contains: ""double": 1.5," and contains: ""true": true," and contains: ""false": false," and contains: ""string": "this is a string"," and contains: ""array": [
+    1,
+    2
+  ]
+}" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="nesting objects" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str(), ContainsSubstring( "\"empty_object\": {\n  }," ) &amp;&amp; ContainsSubstring( "\"fully_object\": {\n    \"key\": 1\n  }" )
+        </Original>
+        <Expanded>
+          "{
+  "empty_object": {
+  },
+  "fully_object": {
+    "key": 1
+  }
+}" ( contains: ""empty_object": {
+  }," and contains: ""fully_object": {
+    "key": 1
+  }" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray will create an empty pair of braces" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Calling writeArray creates array to write the values to" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]"
+        </Original>
+        <Expanded>
+          "[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+==
+"[
+  1,
+  1.5,
+  true,
+  false,
+  "this is a string",
+  {
+    "object": 42
+  },
+  [
+    "array",
+    42.5
+  ]
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonObjectWriter shall not insert superfluous brace" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "{\n}"
+        </Original>
+        <Expanded>
+          "{
+}"
+==
+"{
+}"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Moved from JsonArrayWriter shall not insert superfluous bracket" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "[\n]"
+        </Original>
+        <Expanded>
+          "[
+]"
+==
+"[
+]"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Custom class shall be quoted" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          stream.str() == "\"custom\""
+        </Original>
+        <Expanded>
+          ""custom"" == ""custom""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="JsonWriter escapes charaters in strings properly" tags="[JsonWriter]" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+    <Section name="Quote in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\"\""
+        </Original>
+        <Expanded>
+          ""\""" == ""\"""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backslash in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\\""
+        </Original>
+        <Expanded>
+          ""\\"" == ""\\""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Forward slash in a string is **not** escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"/\""
+        </Original>
+        <Expanded>
+          ""/"" == ""/""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Backspace in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\b\""
+        </Original>
+        <Expanded>
+          ""\b"" == ""\b""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="Formfeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\f\""
+        </Original>
+        <Expanded>
+          ""\f"" == ""\f""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="linefeed in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\n\""
+        </Original>
+        <Expanded>
+          ""\n"" == ""\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="carriage return in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\r\""
+        </Original>
+        <Expanded>
+          ""\r"" == ""\r""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="tab in a string is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\t\""
+        </Original>
+        <Expanded>
+          ""\t"" == ""\t""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Section name="combination of characters is escaped" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+      <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/Json.tests.cpp" >
+        <Original>
+          sstream.str() == "\"\\\\/\\t\\r\\n\""
+        </Original>
+        <Expanded>
+          ""\\/\t\r\n"" == ""\\/\t\r\n""
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="Lambdas in assertions" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Compilation.tests.cpp" >
       <Original>
@@ -9219,7 +9546,7 @@ C
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Nice descriptive name" tags="[.][tag1][tag2][tag3]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       This one ran
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -10005,7 +10332,7 @@ C
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="Parsing tags with non-alphabetical characters is pass-through" tags="[test-spec][test-spec-parser]" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10016,7 +10343,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10027,7 +10354,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[tag with spaces]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10038,7 +10365,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10049,7 +10376,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10060,7 +10387,7 @@ C
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
       tagString := "[I said "good day" sir!]"
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/TestSpecParser.tests.cpp" >
@@ -10471,7 +10798,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="-r/console" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10482,7 +10809,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10499,7 +10826,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="-r/xml" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10510,7 +10837,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10527,7 +10854,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="--reporter/junit" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10538,7 +10865,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10577,7 +10904,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="With output file" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10588,7 +10915,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10605,7 +10932,7 @@ C
     </Section>
     <Section name="reporter" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
       <Section name="With Windows-like absolute path as output file" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="CHECK" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -10616,7 +10943,7 @@ C
             {?}
           </Expanded>
         </Expression>
-        <Info>
+        <Info filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
           result.errorMessage() := ""
         </Info>
         <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/CmdLine.tests.cpp" >
@@ -11441,7 +11768,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11467,7 +11794,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11492,7 +11819,7 @@ C
       </Expanded>
     </Expression>
     <Section name="Automake reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: Automake
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11519,7 +11846,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11545,7 +11872,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11570,7 +11897,7 @@ C
       </Expanded>
     </Expression>
     <Section name="compact reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: compact
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11597,7 +11924,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11623,7 +11950,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11648,7 +11975,7 @@ C
       </Expanded>
     </Expression>
     <Section name="console reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: console
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11674,8 +12001,122 @@ C
         !false
       </Expanded>
     </Expression>
+    <Section name="JSON reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fakeTag"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tags": [
+      {
+        "aliases": [
+          "fakeTag"
+        ],
+        "count": 1
+      }
+    ]" contains: "fakeTag"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring("fake reporter"s)
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "reporters": [
+      {
+        "name": "fake reporter",
+        "description": "fake description"
+      }
+    ]" contains: "fake reporter"
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
+    <Section name="JSON reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        Tested reporter: JSON
+      </Info>
+      <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+        <Original>
+          listingString, ContainsSubstring( "fake test name"s ) &amp;&amp; ContainsSubstring( "fakeTestTag"s )
+        </Original>
+        <Expanded>
+          "{
+  "version": 1,
+  "metadata": {
+    "name": "",
+    "rng-seed": 1234,
+    "catch2-version": "<version>"
+  },
+  "listings": {
+    "tests": [
+      {
+        "name": "fake test name",
+        "class-name": "",
+        "tags": [
+          "fakeTestTag"
+        ],
+        "source-location": {
+          "filename": "fake-file.cpp",
+          "line": 123456789
+        }
+      }
+    ]" ( contains: "fake test name" and contains: "fakeTestTag" )
+        </Expanded>
+      </Expression>
+      <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
+    </Section>
+    <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
+      <Original>
+        !(factories.empty())
+      </Original>
+      <Expanded>
+        !false
+      </Expanded>
+    </Expression>
     <Section name="JUnit reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11702,7 +12143,7 @@ All available tags:
       </Expanded>
     </Expression>
     <Section name="JUnit reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11728,7 +12169,7 @@ Available reporters:
       </Expanded>
     </Expression>
     <Section name="JUnit reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: JUnit
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11756,7 +12197,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11783,7 +12224,7 @@ All available tags:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11809,7 +12250,7 @@ Available reporters:
       </Expanded>
     </Expression>
     <Section name="SonarQube reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: SonarQube
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11837,7 +12278,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11863,7 +12304,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11888,7 +12329,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TAP reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TAP
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11915,7 +12356,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11941,7 +12382,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11966,7 +12407,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="TeamCity reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: TeamCity
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -11993,7 +12434,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists tags" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -12023,7 +12464,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists reporters" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -12051,7 +12492,7 @@ All available test cases:
       </Expanded>
     </Expression>
     <Section name="XML reporter lists tests" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
-      <Info>
+      <Info filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
         Tested reporter: XML
       </Info>
       <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/IntrospectiveTests/Reporters.tests.cpp" >
@@ -14552,6 +14993,50 @@ Message from section two
     <Failure filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" />
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="Testing checked-if 4" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="true" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        true
+      </Original>
+      <Expanded>
+        true
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
+  <TestCase name="Testing checked-if 5" tags="[!shouldfail][checked-if]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+    <Expression success="false" type="CHECKED_ELSE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        false
+      </Original>
+      <Expanded>
+        false
+      </Expanded>
+    </Expression>
+    <Expression success="false" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+      <Original>
+        {Unknown expression after the reported line}
+      </Original>
+      <Expanded>
+        {Unknown expression after the reported line}
+      </Expanded>
+      <Exception filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
+        Uncaught exception should fail!
+      </Exception>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="The NO_FAIL macro reports a failure but does not fail the test" tags="[messages]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
     <Expression success="false" type="CHECK_NOFAIL" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       <Original>
@@ -15670,7 +16155,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One false evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One false evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !AllTrue()
@@ -15712,7 +16197,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One false evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One false evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !AllTrue()
@@ -16020,7 +16505,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, AnyTrue()
@@ -16062,7 +16547,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to true" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, AnyTrue()
@@ -16370,7 +16855,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Basic usage" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !NoneTrue()
@@ -16412,7 +16897,7 @@ There is no extra whitespace here
       <OverallResults successes="1" failures="0" expectedFailures="0" skipped="false"/>
     </Section>
     <Section name="Contained type is convertible to bool" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
-      <Section name="One true evalutes to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
+      <Section name="One true evaluates to false" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
         <Expression success="true" type="REQUIRE_THAT" filename="tests/<exe-name>/UsageTests/MatchersRanges.tests.cpp" >
           <Original>
             data, !NoneTrue()
@@ -18765,7 +19250,7 @@ There is no extra whitespace here
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="looped tests" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[0] (1) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18776,7 +19261,7 @@ There is no extra whitespace here
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[1] (1) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18787,7 +19272,7 @@ There is no extra whitespace here
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[2] (2) is even
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18798,7 +19283,7 @@ There is no extra whitespace here
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[3] (3) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18809,7 +19294,7 @@ There is no extra whitespace here
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[4] (5) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18820,7 +19305,7 @@ There is no extra whitespace here
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[5] (8) is even
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18831,7 +19316,7 @@ There is no extra whitespace here
         0 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[6] (13) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18842,7 +19327,7 @@ There is no extra whitespace here
         1 == 0
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       Testing if fib[7] (21) is even
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -18981,22 +19466,22 @@ There is no extra whitespace here
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="mix info, unscoped info and warning" tags="[info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       info
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       unscoped info
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       and warn may mix
     </Warning>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       info
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       unscoped info
     </Info>
-    <Warning>
+    <Warning filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       they are not cleared after warnings
     </Warning>
     <OverallResult success="false" skips="0"/>
@@ -19203,7 +19688,7 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="not prints unscoped info from previous failures" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen only for the FIRST assertion IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19214,7 +19699,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen only for the SECOND assertion IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19225,7 +19710,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19361,7 +19846,7 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="print unscoped info if passing unscoped info is printed" tags="[info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY be seen IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19375,10 +19860,10 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="prints unscoped info on failure" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD also be seen
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19392,7 +19877,7 @@ b1!
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="prints unscoped info only for the first assertion" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this SHOULD be seen only ONCE
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19411,7 +19896,7 @@ b1!
         true
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       this MAY also be seen only ONCE IF info is printed for passing assertions
     </Info>
     <Expression success="true" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19894,7 +20379,7 @@ b1!
     <OverallResult success="true" skips="1"/>
   </TestCase>
   <TestCase name="send a single char to INFO" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
       3
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Misc.tests.cpp" >
@@ -19908,10 +20393,10 @@ b1!
     <OverallResult success="false" skips="0"/>
   </TestCase>
   <TestCase name="sends information to INFO" tags="[.][failing]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       hi
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       i := 7
     </Info>
     <Expression success="false" type="REQUIRE" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19969,16 +20454,16 @@ b1!
     <OverallResult success="true" skips="0"/>
   </TestCase>
   <TestCase name="stacks unscoped info in loops" tags="[.][failing][info][unscoped]" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       Count 1 to 3...
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       1
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       2
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       3
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -19989,16 +20474,16 @@ b1!
         false
       </Expanded>
     </Expression>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       Count 4 to 6...
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       4
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       5
     </Info>
-    <Info>
+    <Info filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
       6
     </Info>
     <Expression success="false" type="CHECK" filename="tests/<exe-name>/UsageTests/Message.tests.cpp" >
@@ -20648,6 +21133,25 @@ b1!
     </Expression>
     <OverallResult success="true" skips="0"/>
   </TestCase>
+  <TestCase name="uniform_integer_distribution can return the bounds" tags="[distribution][rng]" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.a() == -10
+      </Original>
+      <Expanded>
+        -10 == -10
+      </Expanded>
+    </Expression>
+    <Expression success="true" type="REQUIRE" filename="tests/<exe-name>/IntrospectiveTests/RandomNumberGeneration.tests.cpp" >
+      <Original>
+        dist.b() == 10
+      </Original>
+      <Expanded>
+        10 == 10
+      </Expanded>
+    </Expression>
+    <OverallResult success="true" skips="0"/>
+  </TestCase>
   <TestCase name="unique_ptr reimplementation: basic functionality" tags="[internals][unique-ptr]" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
     <Section name="Default constructed unique_ptr is empty" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
       <Expression success="true" type="REQUIRE_FALSE" filename="tests/<exe-name>/IntrospectiveTests/UniquePtr.tests.cpp" >
@@ -21202,6 +21706,6 @@ b1!
     </Section>
     <OverallResult success="true" skips="0"/>
   </TestCase>
-  <OverallResults successes="2049" failures="145" expectedFailures="32" skips="11"/>
-  <OverallResultsCases successes="309" failures="84" expectedFailures="11" skips="5"/>
+  <OverallResults successes="2079" failures="146" expectedFailures="35" skips="12"/>
+  <OverallResultsCases successes="312" failures="85" expectedFailures="14" skips="6"/>
 </Catch2TestRun>
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp
new file mode 100644
index 00000000..ab096074
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/AssertionHandler.tests.cpp
@@ -0,0 +1,17 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+
+TEST_CASE( "Incomplete AssertionHandler", "[assertion-handler][!shouldfail]" ) {
+    Catch::AssertionHandler catchAssertionHandler(
+        "REQUIRE"_catch_sr,
+        CATCH_INTERNAL_LINEINFO,
+        "Dummy",
+        Catch::ResultDisposition::Normal );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Details.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Details.tests.cpp
index a5a43926..d7175756 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Details.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Details.tests.cpp
@@ -89,6 +89,47 @@ TEST_CASE("Optional comparison ops", "[optional][approvals]") {
     }
 }
 
+namespace {
+    struct MoveChecker {
+        bool has_moved = false;
+        MoveChecker() = default;
+        MoveChecker( MoveChecker const& rhs ) = default;
+        MoveChecker& operator=( MoveChecker const& rhs ) = default;
+        MoveChecker( MoveChecker&& rhs ) noexcept { rhs.has_moved = true; }
+        MoveChecker& operator=( MoveChecker&& rhs ) noexcept {
+            rhs.has_moved = true;
+            return *this;
+        }
+    };
+}
+
+TEST_CASE( "Optional supports move ops", "[optional][approvals]" ) {
+    using Catch::Optional;
+    MoveChecker a;
+    Optional<MoveChecker> opt_A( a );
+    REQUIRE_FALSE( a.has_moved );
+    REQUIRE_FALSE( opt_A->has_moved );
+
+    SECTION( "Move construction from element" ) {
+        Optional<MoveChecker> opt_B( CATCH_MOVE( a ) );
+        REQUIRE( a.has_moved );
+    }
+    SECTION( "Move assignment from element" ) {
+        opt_A = CATCH_MOVE( a );
+        REQUIRE( a.has_moved );
+    }
+    SECTION( "Move construction from optional" ) {
+        Optional<MoveChecker> opt_B( CATCH_MOVE( opt_A ) );
+        REQUIRE( opt_A->has_moved );
+    }
+    SECTION( "Move assignment from optional" ) {
+        Optional<MoveChecker> opt_B( opt_A );
+        REQUIRE_FALSE( opt_A->has_moved );
+        opt_B = CATCH_MOVE( opt_A );
+        REQUIRE( opt_A->has_moved );
+    }
+}
+
 TEST_CASE( "Decomposer checks that the argument is 0 when handling "
            "only-0-comparable types",
            "[decomposition][approvals]" ) {
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
index 08a579c9..d2181702 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp
@@ -9,7 +9,9 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_floating_point_helpers.hpp>
 
+#include <limits>
 
 TEST_CASE("convertToBits", "[floating-point][conversion]") {
     using Catch::Detail::convertToBits;
@@ -72,3 +74,66 @@ TEST_CASE("UlpDistance", "[floating-point][ulp][approvals]") {
     CHECK( ulpDistance( 1.f, 2.f ) == 0x80'00'00 );
     CHECK( ulpDistance( -2.f, 2.f ) == 0x80'00'00'00 );
 }
+
+
+
+TEMPLATE_TEST_CASE("gamma", "[approvals][floating-point][ulp][gamma]", float, double) {
+    using Catch::Detail::gamma;
+    using Catch::Detail::directCompare;
+
+    // We need to butcher the equal tests with the directCompare helper,
+    // because the Wfloat-equal triggers in decomposer rather than here,
+    // so we cannot locally disable it. Goddamn GCC.
+    CHECK( directCompare( gamma( TestType( -1. ), TestType( 1. ) ),
+                          gamma( TestType( 0.2332 ), TestType( 1.0 ) ) ) );
+    CHECK( directCompare( gamma( TestType( -2. ), TestType( 0 ) ),
+                          gamma( TestType( 1. ), TestType( 1.5 ) ) ) );
+    CHECK( gamma( TestType( 0. ), TestType( 1.0 ) ) <
+           gamma( TestType( 1.0 ), TestType( 1.5 ) ) );
+    CHECK( gamma( TestType( 0 ), TestType( 1. ) ) <
+           std::numeric_limits<TestType>::epsilon() );
+    CHECK( gamma( TestType( -1. ), TestType( -0. ) ) <
+           std::numeric_limits<TestType>::epsilon() );
+    CHECK( directCompare( gamma( TestType( 1. ), TestType( 2. ) ),
+                          std::numeric_limits<TestType>::epsilon() ) );
+    CHECK( directCompare( gamma( TestType( -2. ), TestType( -1. ) ),
+                          std::numeric_limits<TestType>::epsilon() ) );
+}
+
+TEMPLATE_TEST_CASE("count_equidistant_floats",
+                   "[approvals][floating-point][distance]",
+                   float,
+                   double) {
+    using Catch::Detail::count_equidistant_floats;
+    auto count_steps = []( TestType a, TestType b ) {
+        return count_equidistant_floats( a, b, Catch::Detail::gamma( a, b ) );
+    };
+
+    CHECK( count_steps( TestType( -1. ), TestType( 1. ) ) ==
+           2 * count_steps( TestType( 0. ), TestType( 1. ) ) );
+}
+
+TEST_CASE( "count_equidistant_floats",
+           "[approvals][floating-point][distance]" ) {
+    using Catch::Detail::count_equidistant_floats;
+    auto count_floats_with_scaled_ulp = []( auto a, auto b ) {
+        return count_equidistant_floats( a, b, Catch::Detail::gamma( a, b ) );
+    };
+
+    CHECK( count_floats_with_scaled_ulp( 1., 1.5 ) == 1ull << 51 );
+    CHECK( count_floats_with_scaled_ulp( 1.25, 1.5 ) == 1ull << 50 );
+    CHECK( count_floats_with_scaled_ulp( 1.f, 1.5f ) == 1 << 22 );
+    CHECK( count_floats_with_scaled_ulp( -std::numeric_limits<float>::max(),
+                                         std::numeric_limits<float>::max() ) ==
+           33554430 ); // (1 << 25) - 2 due to not including infinities
+    CHECK( count_floats_with_scaled_ulp( -std::numeric_limits<double>::max(),
+                                         std::numeric_limits<double>::max() ) ==
+           18014398509481982 ); // (1 << 54) - 2 due to not including infinities
+
+    STATIC_REQUIRE( std::is_same<std::uint64_t,
+                                 decltype( count_floats_with_scaled_ulp(
+                                     0., 1. ) )>::value );
+    STATIC_REQUIRE( std::is_same<std::uint32_t,
+                                 decltype( count_floats_with_scaled_ulp(
+                                     0.f, 1.f ) )>::value );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
index 64e943f8..acfeebed 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp
@@ -10,6 +10,8 @@
 #    pragma GCC diagnostic ignored "-Wfloat-equal"
 #endif
 
+#include <helpers/range_test_helpers.hpp>
+
 #include <catch2/catch_approx.hpp>
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generator_exception.hpp>
@@ -412,6 +414,7 @@ TEST_CASE("GENERATE handles function (pointers)", "[generators][compilation][app
 
 TEST_CASE("GENERATE decays arrays", "[generators][compilation][approvals]") {
     auto str = GENERATE("abc", "def", "gh");
+    (void)str;
     STATIC_REQUIRE(std::is_same<decltype(str), const char*>::value);
 }
 
@@ -544,3 +547,30 @@ TEST_CASE("Filter generator throws exception for empty generator",
         filter( []( int ) { return false; }, value( 3 ) ),
         Catch::GeneratorException );
 }
+
+TEST_CASE("from_range(container) supports ADL begin/end and arrays", "[generators][from-range][approvals]") {
+    using namespace Catch::Generators;
+
+    SECTION("C array") {
+        int arr[3]{ 5, 6, 7 };
+        auto gen = from_range( arr );
+        REQUIRE( gen.get() == 5 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 6 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 7 );
+        REQUIRE_FALSE( gen.next() );
+    }
+
+    SECTION( "ADL range" ) {
+        unrelated::needs_ADL_begin<int> range{ 1, 2, 3 };
+        auto gen = from_range( range );
+        REQUIRE( gen.get() == 1 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 2 );
+        REQUIRE( gen.next() );
+        REQUIRE( gen.get() == 3 );
+        REQUIRE_FALSE( gen.next() );
+    }
+
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp
new file mode 100644
index 00000000..fd620ebb
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Integer.tests.cpp
@@ -0,0 +1,150 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
+
+namespace {
+    template <typename Int>
+    static void
+    CommutativeMultCheck( Int a, Int b, Int upper_result, Int lower_result ) {
+        using Catch::Detail::extendedMult;
+        using Catch::Detail::ExtendedMultResult;
+        CHECK( extendedMult( a, b ) ==
+               ExtendedMultResult<Int>{ upper_result, lower_result } );
+        CHECK( extendedMult( b, a ) ==
+               ExtendedMultResult<Int>{ upper_result, lower_result } );
+    }
+} // namespace
+
+TEST_CASE( "extendedMult 64x64", "[Integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint64_t>( 0x1234'5678'9ABC'DEFF, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint64_t>( uint64_t( 1 ) << 63, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint64_t>( 0xcdcd'dcdc'0000'0000,
+                                    0x0000'0000'aeae'aeae,
+                                    0x0000'0000'8c6e'5a77,
+                                    0x7391'a588'0000'0000 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint64_t>( 0xaaaa'aaaa'aaaa'aaaa,
+                                    0xbbbb'bbbb'bbbb'bbbb,
+                                    0x7d27'd27d'27d2'7d26,
+                                    0xd82d'82d8'2d82'd82e );
+
+    CommutativeMultCheck<uint64_t>( 0x7d27'd27d'27d2'7d26,
+                                    0xd82d'82d8'2d82'd82e,
+                                    0x69af'd991'8256'b953,
+                                    0x8724'8909'fcb6'8cd4 );
+
+    CommutativeMultCheck<uint64_t>( 0xdead'beef'dead'beef,
+                                    0xfeed'feed'feed'feef,
+                                    0xddbf'680b'2b0c'b558,
+                                    0x7a36'b06f'2ce9'6321 );
+
+    CommutativeMultCheck<uint64_t>( 0xddbf'680b'2b0c'b558,
+                                    0x7a36'b06f'2ce9'6321,
+                                    0x69dc'96c9'294b'fc7f,
+                                    0xd038'39fa'a3dc'6858 );
+
+    CommutativeMultCheck<uint64_t>( 0x61c8'8646'80b5'83eb,
+                                    0x61c8'8646'80b5'83eb,
+                                    0x2559'92d3'8220'8bbe,
+                                    0xdf44'2d22'ce48'59b9 );
+}
+
+TEST_CASE( "SizedUnsignedType helpers", "[integer][approvals]" ) {
+    using Catch::Detail::SizedUnsignedType_t;
+    using Catch::Detail::DoubleWidthUnsignedType_t;
+
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<1> ) == 1 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<2> ) == 2 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<4> ) == 4 );
+    STATIC_REQUIRE( sizeof( SizedUnsignedType_t<8> ) == 8 );
+
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint8_t> ) == 2 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint8_t>>::value );
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint16_t> ) == 4 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint16_t>>::value );
+    STATIC_REQUIRE( sizeof( DoubleWidthUnsignedType_t<std::uint32_t> ) == 8 );
+    STATIC_REQUIRE( std::is_unsigned<DoubleWidthUnsignedType_t<std::uint32_t>>::value );
+}
+
+TEST_CASE( "extendedMult 32x32", "[integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint32_t>( 0x1234'5678, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint32_t>( uint32_t(1) << 31, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint32_t>( 0xdcdc'0000, 0x0000'aabb, 0x0000'934b, 0x6cb4'0000 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint32_t>(
+        0xaaaa'aaaa, 0xbbbb'bbbb, 0x7d27'd27c, 0x2d82'd82e );
+
+    CommutativeMultCheck<uint32_t>(
+        0x7d27'd27c, 0x2d82'd82e, 0x163f'f7e8, 0xc5b8'7248 );
+
+    CommutativeMultCheck<uint32_t>(
+        0xdead'beef, 0xfeed'feed, 0xddbf'6809, 0x6f8d'e543 );
+
+    CommutativeMultCheck<uint32_t>(
+        0xddbf'6809, 0x6f8d'e543, 0x60a0'e71e, 0x751d'475b );
+}
+
+TEST_CASE( "extendedMult 8x8", "[integer][approvals]" ) {
+    // a x 0 == 0
+    CommutativeMultCheck<uint8_t>( 0xcd, 0, 0, 0 );
+
+    // bit carried from low half to upper half
+    CommutativeMultCheck<uint8_t>( uint8_t( 1 ) << 7, 2, 1, 0 );
+
+    // bits in upper half on one side, bits in lower half on other side
+    CommutativeMultCheck<uint8_t>( 0x80, 0x03, 0x01, 0x80 );
+
+    // Some input numbers without interesting patterns
+    CommutativeMultCheck<uint8_t>( 0xaa, 0xbb, 0x7c, 0x2e );
+    CommutativeMultCheck<uint8_t>( 0x7c, 0x2e, 0x16, 0x48 );
+    CommutativeMultCheck<uint8_t>( 0xdc, 0xcd, 0xb0, 0x2c );
+    CommutativeMultCheck<uint8_t>( 0xb0, 0x2c, 0x1e, 0x40 );
+}
+
+
+TEST_CASE( "negative and positive signed integers keep their order after transposeToNaturalOrder",
+                    "[integer][approvals]") {
+    using Catch::Detail::transposeToNaturalOrder;
+    int32_t negative( -1 );
+    int32_t positive( 1 );
+    uint32_t adjusted_negative =
+        transposeToNaturalOrder<int32_t>( static_cast<uint32_t>( negative ) );
+    uint32_t adjusted_positive =
+        transposeToNaturalOrder<int32_t>( static_cast<uint32_t>( positive ) );
+    REQUIRE( adjusted_negative < adjusted_positive );
+    REQUIRE( adjusted_positive - adjusted_negative == 2 );
+
+    // Conversion has to be reversible
+    REQUIRE( negative == static_cast<int32_t>( transposeToNaturalOrder<int32_t>(
+                             adjusted_negative ) ) );
+    REQUIRE( positive == static_cast<int32_t>( transposeToNaturalOrder<int32_t>(
+                             adjusted_positive ) ) );
+}
+
+TEST_CASE( "unsigned integers are unchanged by transposeToNaturalOrder",
+           "[integer][approvals]") {
+    using Catch::Detail::transposeToNaturalOrder;
+    uint32_t max = std::numeric_limits<uint32_t>::max();
+    uint32_t zero = 0;
+    REQUIRE( max == transposeToNaturalOrder<uint32_t>( max ) );
+    REQUIRE( zero == transposeToNaturalOrder<uint32_t>( zero ) );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
index 96c0977b..bc8d715b 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp
@@ -22,6 +22,8 @@
 #include <catch2/benchmark/detail/catch_benchmark_function.hpp>
 #include <catch2/benchmark/detail/catch_estimate_clock.hpp>
 
+#include <numeric>
+
 namespace {
     struct manual_clock {
     public:
@@ -154,8 +156,12 @@ TEST_CASE("uniform samples", "[benchmark]") {
     std::vector<double> samples(100);
     std::fill(samples.begin(), samples.end(), 23);
 
-    using it = std::vector<double>::iterator;
-    auto e = Catch::Benchmark::Detail::bootstrap(0.95, samples.begin(), samples.end(), samples, [](it a, it b) {
+    auto e = Catch::Benchmark::Detail::bootstrap(
+        0.95,
+        samples.data(),
+        samples.data() + samples.size(),
+        samples,
+        []( double const* a, double const* b ) {
         auto sum = std::accumulate(a, b, 0.);
         return sum / (b - a);
     });
@@ -196,7 +202,7 @@ TEST_CASE("normal_quantile", "[benchmark]") {
 TEST_CASE("mean", "[benchmark]") {
     std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-    auto m = Catch::Benchmark::Detail::mean(x.begin(), x.end());
+    auto m = Catch::Benchmark::Detail::mean(x.data(), x.data() + x.size());
 
     REQUIRE(m == 19.);
 }
@@ -204,9 +210,9 @@ TEST_CASE("mean", "[benchmark]") {
 TEST_CASE("weighted_average_quantile", "[benchmark]") {
     std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-    auto q1 = Catch::Benchmark::Detail::weighted_average_quantile(1, 4, x.begin(), x.end());
-    auto med = Catch::Benchmark::Detail::weighted_average_quantile(1, 2, x.begin(), x.end());
-    auto q3 = Catch::Benchmark::Detail::weighted_average_quantile(3, 4, x.begin(), x.end());
+    auto q1 = Catch::Benchmark::Detail::weighted_average_quantile(1, 4, x.data(), x.data() + x.size());
+    auto med = Catch::Benchmark::Detail::weighted_average_quantile(1, 2, x.data(), x.data() + x.size());
+    auto q3 = Catch::Benchmark::Detail::weighted_average_quantile(3, 4, x.data(), x.data() + x.size());
 
     REQUIRE(q1 == 14.5);
     REQUIRE(med == 18.);
@@ -225,7 +231,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("none") {
         std::vector<double> x{ 10., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 0, 0);
@@ -233,7 +240,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("low severe") {
         std::vector<double> x{ -12., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 1, 0, 0, 0);
@@ -241,7 +249,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("low mild") {
         std::vector<double> x{ 1., 20., 14., 16., 30., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 1, 0, 0);
@@ -249,7 +258,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("high mild") {
         std::vector<double> x{ 10., 20., 14., 16., 36., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 1, 0);
@@ -257,7 +267,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("high severe") {
         std::vector<double> x{ 10., 20., 14., 16., 49., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 0, 0, 0, 1);
@@ -265,7 +276,8 @@ TEST_CASE("classify_outliers", "[benchmark]") {
     SECTION("mixed") {
         std::vector<double> x{ -20., 20., 14., 16., 39., 24. };
 
-        auto o = Catch::Benchmark::Detail::classify_outliers(x.begin(), x.end());
+        auto o = Catch::Benchmark::Detail::classify_outliers(
+            x.data(), x.data() + x.size() );
 
         REQUIRE(o.samples_seen == static_cast<int>(x.size()));
         require_outliers(o, 1, 0, 1, 0);
@@ -280,15 +292,13 @@ TEST_CASE("analyse", "[approvals][benchmark]") {
     data.benchmarkSamples = 99;
     Catch::Config config{data};
 
-    using Duration = Catch::Benchmark::FloatDuration<Catch::Benchmark::default_clock>;
-
-    Catch::Benchmark::Environment<Duration> env;
-    std::vector<Duration> samples(99);
+    using FDuration = Catch::Benchmark::FDuration;
+    std::vector<FDuration> samples(99);
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = Duration(23 + (i % 3 - 1));
+        samples[i] = FDuration(23 + (i % 3 - 1));
     }
 
-    auto analysis = Catch::Benchmark::Detail::analyse(config, env, samples.begin(), samples.end());
+    auto analysis = Catch::Benchmark::Detail::analyse(config, samples.data(), samples.data() + samples.size());
     CHECK( analysis.mean.point.count() == 23 );
     CHECK( analysis.mean.lower_bound.count() < 23 );
     CHECK(analysis.mean.lower_bound.count() > 22);
@@ -321,15 +331,13 @@ TEST_CASE("analyse no analysis", "[benchmark]") {
     data.benchmarkSamples = 99;
     Catch::Config config{ data };
 
-    using Duration = Catch::Benchmark::FloatDuration<Catch::Benchmark::default_clock>;
-
-    Catch::Benchmark::Environment<Duration> env;
-    std::vector<Duration> samples(99);
+    using FDuration = Catch::Benchmark::FDuration;
+    std::vector<FDuration> samples(99);
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = Duration(23 + (i % 3 - 1));
+        samples[i] = FDuration(23 + (i % 3 - 1));
     }
 
-    auto analysis = Catch::Benchmark::Detail::analyse(config, env, samples.begin(), samples.end());
+    auto analysis = Catch::Benchmark::Detail::analyse(config, samples.data(), samples.data() + samples.size());
     CHECK(analysis.mean.point.count() == 23);
     CHECK(analysis.mean.lower_bound.count() == 23);
     CHECK(analysis.mean.upper_bound.count() == 23);
@@ -442,6 +450,6 @@ TEST_CASE("Failing benchmarks", "[!benchmark][.approvals]") {
 }
 
 TEST_CASE( "Failing benchmark respects should-fail",
-           "[!shouldfail][!benchmark][.approvals]" ) {
+           "[!shouldfail][!benchmark][approvals]" ) {
     BENCHMARK( "Asserting benchmark" ) { REQUIRE( 1 == 2 ); };
 }
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp
new file mode 100644
index 00000000..8204e3c4
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Json.tests.cpp
@@ -0,0 +1,152 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/internal/catch_jsonwriter.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+
+#include <sstream>
+
+namespace {
+    struct Custom {};
+    static std::ostream& operator<<( std::ostream& os, Custom const& ) {
+        return os << "custom";
+    }
+} // namespace
+
+TEST_CASE( "JsonWriter", "[JSON][JsonWriter]" ) {
+
+    std::stringstream stream;
+    SECTION( "Newly constructed JsonWriter does nothing" ) {
+        Catch::JsonValueWriter writer{ stream };
+        REQUIRE( stream.str() == "" );
+    }
+
+    SECTION( "Calling writeObject will create an empty pair of braces" ) {
+        { auto writer = Catch::JsonValueWriter{ stream }.writeObject(); }
+        REQUIRE( stream.str() == "{\n}" );
+    }
+
+    SECTION( "Calling writeObject with key will create an object to write the "
+             "value" ) {
+        using Catch::Matchers::ContainsSubstring;
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeObject();
+            writer.write( "int" ).write( 1 );
+            writer.write( "double" ).write( 1.5 );
+            writer.write( "true" ).write( true );
+            writer.write( "false" ).write( false );
+            writer.write( "string" ).write( "this is a string" );
+            writer.write( "array" ).writeArray().write( 1 ).write( 2 );
+        }
+        REQUIRE_THAT(
+            stream.str(),
+            ContainsSubstring( "\"int\": 1," ) &&
+                ContainsSubstring( "\"double\": 1.5," ) &&
+                ContainsSubstring( "\"true\": true," ) &&
+                ContainsSubstring( "\"false\": false," ) &&
+                ContainsSubstring( "\"string\": \"this is a string\"," ) &&
+                ContainsSubstring( "\"array\": [\n    1,\n    2\n  ]\n}" ) );
+    }
+
+    SECTION( "nesting objects" ) {
+        using Catch::Matchers::ContainsSubstring;
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeObject();
+            writer.write( "empty_object" ).writeObject();
+            writer.write( "fully_object" )
+                .writeObject()
+                .write( "key" )
+                .write( 1 );
+        }
+        REQUIRE_THAT( stream.str(),
+                      ContainsSubstring( "\"empty_object\": {\n  }," ) &&
+                          ContainsSubstring(
+                              "\"fully_object\": {\n    \"key\": 1\n  }" ) );
+    }
+
+    SECTION( "Calling writeArray will create an empty pair of braces" ) {
+        { auto writer = Catch::JsonValueWriter{ stream }.writeArray(); }
+        REQUIRE( stream.str() == "[\n]" );
+    }
+
+    SECTION( "Calling writeArray creates array to write the values to" ) {
+        {
+            auto writer = Catch::JsonValueWriter{ stream }.writeArray();
+            writer.write( 1 );
+            writer.write( 1.5 );
+            writer.write( true );
+            writer.write( false );
+            writer.write( "this is a string" );
+            writer.writeObject().write( "object" ).write( 42 );
+            writer.writeArray().write( "array" ).write( 42.5 );
+        }
+        REQUIRE( stream.str() == "[\n  1,\n  1.5,\n  true,\n  false,\n  \"this is a string\",\n  {\n    \"object\": 42\n  },\n  [\n    \"array\",\n    42.5\n  ]\n]" );
+    }
+
+    SECTION(
+        "Moved from JsonObjectWriter shall not insert superfluous brace" ) {
+        {
+            auto writer = Catch::JsonObjectWriter{ stream };
+            auto another_writer = std::move( writer );
+        }
+        REQUIRE( stream.str() == "{\n}" );
+    }
+    SECTION(
+        "Moved from JsonArrayWriter shall not insert superfluous bracket" ) {
+        {
+            auto writer = Catch::JsonArrayWriter{ stream };
+            auto another_writer = std::move( writer );
+        }
+        REQUIRE( stream.str() == "[\n]" );
+    }
+    SECTION( "Custom class shall be quoted" ) {
+        Catch::JsonValueWriter{ stream }.write( Custom{} );
+        REQUIRE( stream.str() == "\"custom\"" );
+    }
+}
+
+TEST_CASE( "JsonWriter escapes charaters in strings properly", "[JsonWriter]" ) {
+    std::stringstream sstream;
+    SECTION( "Quote in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\"" );
+        REQUIRE( sstream.str() == "\"\\\"\"" );
+    }
+    SECTION("Backslash in a string is escaped") {
+        Catch::JsonValueWriter{ sstream }.write( "\\" );
+        REQUIRE( sstream.str() == "\"\\\\\"" );
+    }
+    SECTION( "Forward slash in a string is **not** escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "/" );
+        REQUIRE( sstream.str() == "\"/\"" );
+    }
+    SECTION( "Backspace in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\b" );
+        REQUIRE( sstream.str() == "\"\\b\"" );
+    }
+    SECTION( "Formfeed in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\f" );
+        REQUIRE( sstream.str() == "\"\\f\"" );
+    }
+    SECTION( "linefeed in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\n" );
+        REQUIRE( sstream.str() == "\"\\n\"" );
+    }
+    SECTION( "carriage return in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\r" );
+        REQUIRE( sstream.str() == "\"\\r\"" );
+    }
+    SECTION( "tab in a string is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\t" );
+        REQUIRE( sstream.str() == "\"\\t\"" );
+    }
+    SECTION( "combination of characters is escaped" ) {
+        Catch::JsonValueWriter{ sstream }.write( "\\/\t\r\n" );
+        REQUIRE( sstream.str() == "\"\\\\/\\t\\r\\n\"" );
+    }
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
index 8018b7eb..03be6c9c 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/RandomNumberGeneration.tests.cpp
@@ -7,9 +7,17 @@
 // SPDX-License-Identifier: BSL-1.0
 
 #include <catch2/catch_test_macros.hpp>
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/internal/catch_floating_point_helpers.hpp>
+#include <catch2/internal/catch_random_integer_helpers.hpp>
 #include <catch2/internal/catch_random_number_generator.hpp>
 #include <catch2/internal/catch_random_seed_generation.hpp>
+#include <catch2/internal/catch_uniform_floating_point_distribution.hpp>
+#include <catch2/internal/catch_uniform_integer_distribution.hpp>
 #include <catch2/generators/catch_generators.hpp>
+#include <catch2/matchers/catch_matchers_range_equals.hpp>
+
+#include <random>
 
 TEST_CASE("Our PCG implementation provides expected results for known seeds", "[rng]") {
     Catch::SimplePcg32 rng;
@@ -60,3 +68,523 @@ TEST_CASE("Random seed generation accepts known methods", "[rng][seed]") {
 
     REQUIRE_NOTHROW(Catch::generateRandomSeed(method));
 }
+
+TEMPLATE_TEST_CASE("uniform_floating_point_distribution never returns infs from finite range",
+          "[rng][distribution][floating-point][approvals]", float, double) {
+    std::random_device rd{};
+    Catch::SimplePcg32 pcg( rd() );
+    Catch::uniform_floating_point_distribution<TestType> dist(
+        -std::numeric_limits<TestType>::max(),
+        std::numeric_limits<TestType>::max() );
+
+    for (size_t i = 0; i < 10'000; ++i) {
+        auto ret = dist( pcg );
+        REQUIRE_FALSE( std::isinf( ret ) );
+        REQUIRE_FALSE( std::isnan( ret ) );
+    }
+}
+
+TEST_CASE( "fillBitsFrom - shortening and stretching", "[rng][approvals]" ) {
+    using Catch::Detail::fillBitsFrom;
+
+    // The seed is not important, but the numbers below have to be repeatable.
+    // They should also exhibit the same general pattern of being prefixes
+    Catch::SimplePcg32 pcg( 0xaabb'ccdd );
+
+    SECTION( "Shorten to 8 bits" ) {
+        // We cast the result to avoid dealing with char-like type in uint8_t
+        auto shortened = static_cast<uint32_t>( fillBitsFrom<uint8_t>( pcg ) );
+        REQUIRE( shortened == 0xcc );
+    }
+    SECTION( "Shorten to 16 bits" ) {
+        auto shortened = fillBitsFrom<uint16_t>( pcg );
+        REQUIRE( shortened == 0xccbe );
+    }
+    SECTION( "Keep at 32 bits" ) {
+        auto n = fillBitsFrom<uint32_t>( pcg );
+        REQUIRE( n == 0xccbe'5f04 );
+    }
+    SECTION( "Stretch to 64 bits" ) {
+        auto stretched = fillBitsFrom<uint64_t>( pcg );
+        REQUIRE( stretched == 0xccbe'5f04'a424'a486 );
+    }
+}
+
+TEST_CASE("uniform_integer_distribution can return the bounds", "[rng][distribution]") {
+    Catch::uniform_integer_distribution<int32_t> dist( -10, 10 );
+    REQUIRE( dist.a() == -10 );
+    REQUIRE( dist.b() == 10 );
+}
+
+namespace {
+    template <typename T>
+    static void CheckReturnValue(Catch::uniform_integer_distribution<T>& dist,
+                                 Catch::SimplePcg32& rng,
+                                 T target) {
+        REQUIRE( dist.a() == dist.b() );
+        for (int i = 0; i < 1'000; ++i) {
+            REQUIRE( dist( rng ) == target );
+        }
+    }
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution can handle unit ranges",
+                    "[rng][distribution][approvals]",
+                    unsigned char,
+                    signed char,
+                    char,
+                    uint8_t,
+                    int8_t,
+                    uint16_t,
+                    int16_t,
+                    uint32_t,
+                    int32_t,
+                    uint64_t,
+                    int64_t ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    // We check unitary ranges of 3 different values, min for type, max for type,
+    // some value inbetween just to make sure
+    SECTION("lowest value") {
+        constexpr auto lowest = std::numeric_limits<TestType>::min();
+        Catch::uniform_integer_distribution<TestType> dist( lowest, lowest );
+        CheckReturnValue( dist, pcg, lowest );
+    }
+    SECTION( "highest value" ) {
+        constexpr auto highest = std::numeric_limits<TestType>::max();
+        Catch::uniform_integer_distribution<TestType> dist( highest, highest );
+        CheckReturnValue( dist, pcg, highest );
+    }
+    SECTION( "some value" ) {
+        constexpr auto some = TestType( 42 );
+        Catch::uniform_integer_distribution<TestType> dist( some, some );
+        CheckReturnValue( dist, pcg, some );
+    }
+}
+
+// Bool needs its own test because it doesn't have a valid "third" value
+TEST_CASE( "uniform_integer_distribution can handle boolean unit ranges",
+           "[rng][distribution][approvals]" ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    // We check unitary ranges of 3 different values, min for type, max for
+    // type, some value inbetween just to make sure
+    SECTION( "lowest value" ) {
+        Catch::uniform_integer_distribution<bool> dist( false, false );
+        CheckReturnValue( dist, pcg, false );
+    }
+    SECTION( "highest value" ) {
+        Catch::uniform_integer_distribution<bool> dist( true, true );
+        CheckReturnValue( dist, pcg, true );
+    }
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution can handle full width ranges",
+                    "[rng][distribution][approvals]",
+                    unsigned char,
+                    signed char,
+                    char,
+                    uint8_t,
+                    int8_t,
+                    uint16_t,
+                    int16_t,
+                    uint32_t,
+                    int32_t,
+                    uint64_t,
+                    int64_t ) {
+    // We want random seed to sample different parts of the rng state,
+    // the output is predetermined anyway
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    constexpr auto lowest = std::numeric_limits<TestType>::min();
+    constexpr auto highest = std::numeric_limits<TestType>::max();
+    Catch::uniform_integer_distribution<TestType> dist( lowest, highest );
+    STATIC_REQUIRE( std::is_same<TestType, decltype( dist( pcg ) )>::value );
+
+    // We need to do bit operations on the results, so we will have to
+    // cast them to unsigned type.
+    using BitType = std::make_unsigned_t<TestType>;
+    BitType ORs = 0;
+    BitType ANDs = BitType(-1);
+    for (int i = 0; i < 100; ++i) {
+        auto bits = static_cast<BitType>( dist( pcg ) );
+        ORs |= bits;
+        ANDs &= bits;
+    }
+    // Assuming both our RNG and distribution are unbiased, asking for
+    // the full range should essentially give us random bit generator.
+    // Over long run, OR of all the generated values should have all
+    // bits set to 1, while AND should have all bits set to 0.
+    // The chance of this test failing for unbiased pipeline is
+    // 1 / 2**iters, which for 100 iterations is astronomical.
+    REQUIRE( ORs == BitType( -1 ) );
+    REQUIRE( ANDs == 0 );
+}
+
+namespace {
+    template <typename T>
+    struct uniform_integer_test_params;
+
+    template <>
+    struct uniform_integer_test_params<bool> {
+        static constexpr bool lowest = false;
+        static constexpr bool highest = true;
+        //  This seems weird, but it is an artifact of the specific seed
+        static constexpr bool expected[] = { true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             false,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             true,
+                                             false,
+                                             true,
+                                             true };
+    };
+
+    template <>
+    struct uniform_integer_test_params<char> {
+        static constexpr char lowest = 32;
+        static constexpr char highest = 126;
+        static constexpr char expected[] = { 'k',
+                                             '\\',
+                                             'Z',
+                                             'X',
+                                             '`',
+                                             'Q',
+                                             ';',
+                                             'o',
+                                             ']',
+                                             'T',
+                                             'v',
+                                             'p',
+                                             ':',
+                                             'S',
+                                             't' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint8_t> {
+        static constexpr uint8_t lowest = 3;
+        static constexpr uint8_t highest = 123;
+        static constexpr uint8_t expected[] = { 'c',
+                                                'P',
+                                                'M',
+                                                'J',
+                                                'U',
+                                                'A',
+                                                '%',
+                                                'h',
+                                                'Q',
+                                                'F',
+                                                'q',
+                                                'i',
+                                                '$',
+                                                'E',
+                                                'o' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int8_t> {
+        static constexpr int8_t lowest = -27;
+        static constexpr int8_t highest = 73;
+        static constexpr int8_t expected[] = { '5',
+                                               '%',
+                                               '#',
+                                               ' ',
+                                               '*',
+                                               25,
+                                               2,
+                                               '9',
+                                               '&',
+                                               29,
+                                               'A',
+                                               ':',
+                                               1,
+                                               28,
+                                               '?' };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint16_t> {
+        static constexpr uint16_t lowest = 123;
+        static constexpr uint16_t highest = 33333;
+        static constexpr uint16_t expected[] = { 26684,
+                                                 21417,
+                                                 20658,
+                                                 19791,
+                                                 22896,
+                                                 17433,
+                                                 9806,
+                                                 27948,
+                                                 21767,
+                                                 18588,
+                                                 30556,
+                                                 28244,
+                                                 9439,
+                                                 18293,
+                                                 29949 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int16_t> {
+        static constexpr int16_t lowest = -17222;
+        static constexpr int16_t highest = 17222;
+        static constexpr int16_t expected[] = { 10326,
+                                                 4863,
+                                                 4076,
+                                                 3177,
+                                                 6397,
+                                                 731,
+                                                 -7179,
+                                                 11637,
+                                                 5226,
+                                                 1929,
+                                                 14342,
+                                                 11944,
+                                                 -7560,
+                                                 1623,
+                                                 13712 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint32_t> {
+        static constexpr uint32_t lowest = 17222;
+        static constexpr uint32_t highest = 234234;
+        static constexpr uint32_t expected[] = { 190784,
+                                                 156367,
+                                                 151409,
+                                                 145743,
+                                                 166032,
+                                                 130337,
+                                                 80501,
+                                                 199046,
+                                                 158654,
+                                                 137883,
+                                                 216091,
+                                                 200981,
+                                                 78099,
+                                                 135954,
+                                                 212120 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int32_t> {
+        static constexpr int32_t lowest = -237272;
+        static constexpr int32_t highest = 234234;
+        static constexpr int32_t expected[] = { 139829,
+                                                65050,
+                                                54278,
+                                                41969,
+                                                86051,
+                                                8494,
+                                                -99785,
+                                                157781,
+                                                70021,
+                                                24890,
+                                                194815,
+                                                161985,
+                                                -105004,
+                                                20699,
+                                                186186 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<uint64_t> {
+        static constexpr uint64_t lowest = 1234;
+        static constexpr uint64_t highest = 1234567890;
+        static constexpr uint64_t expected[] = { 987382749,
+                                                 763380386,
+                                                 846572137,
+                                                 359990258,
+                                                 804599765,
+                                                 1131353566,
+                                                 346324913,
+                                                 1108760730,
+                                                 1141693933,
+                                                 856999148,
+                                                 879390623,
+                                                 1149485521,
+                                                 900556586,
+                                                 952385958,
+                                                 807916408 };
+    };
+
+    template <>
+    struct uniform_integer_test_params<int64_t> {
+        static constexpr int64_t lowest = -1234567890;
+        static constexpr int64_t highest = 1234567890;
+        static constexpr int64_t expected[] = { 740197113,
+                                                292191940,
+                                                458575608,
+                                                -514589122,
+                                                374630781,
+                                                1028139036,
+                                                -541919840,
+                                                982953318,
+                                                1048819790,
+                                                479429651,
+                                                524212647,
+                                                1064402981,
+                                                566544615,
+                                                670203462,
+                                                381264073 };
+    };
+
+    // We need these definitions for C++14 and earlier, but
+    // GCC will complain about them in newer C++ standards
+#if __cplusplus <= 201402L
+    constexpr bool uniform_integer_test_params<bool>::expected[];
+    constexpr char uniform_integer_test_params<char>::expected[];
+    constexpr uint8_t uniform_integer_test_params<uint8_t>::expected[];
+    constexpr int8_t uniform_integer_test_params<int8_t>::expected[];
+    constexpr uint16_t uniform_integer_test_params<uint16_t>::expected[];
+    constexpr int16_t uniform_integer_test_params<int16_t>::expected[];
+    constexpr uint32_t uniform_integer_test_params<uint32_t>::expected[];
+    constexpr int32_t uniform_integer_test_params<int32_t>::expected[];
+    constexpr uint64_t uniform_integer_test_params<uint64_t>::expected[];
+    constexpr int64_t uniform_integer_test_params<int64_t>::expected[];
+#endif
+
+}
+
+TEMPLATE_TEST_CASE( "uniform_integer_distribution is reproducible",
+                    "[rng][distribution][approvals]",
+                   bool,
+                   char,
+                   uint8_t,
+                   int8_t,
+                   uint16_t,
+                   int16_t,
+                   uint32_t,
+                   int32_t,
+                   uint64_t,
+                   int64_t) {
+    Catch::SimplePcg32 pcg( 0xaabb'ccdd );
+
+    constexpr auto lowest = uniform_integer_test_params<TestType>::lowest;
+    constexpr auto highest = uniform_integer_test_params<TestType>::highest;
+    Catch::uniform_integer_distribution<TestType> dist(lowest, highest);
+
+    constexpr auto iters = 15;
+    std::array<TestType, iters> generated;
+    for (int i = 0; i < iters; ++i) {
+        generated[i] = dist( pcg );
+    }
+
+    REQUIRE_THAT(generated, Catch::Matchers::RangeEquals(uniform_integer_test_params<TestType>::expected));
+}
+
+
+namespace {
+    template <typename T>
+    struct uniform_fp_test_params;
+
+    template<>
+    struct uniform_fp_test_params<float> {
+        // These are exactly representable
+        static constexpr float lowest = -256.125f;
+        static constexpr float highest = 385.125f;
+        // These are just round-trip formatted
+        static constexpr float expected[] = { 92.56961f,
+                                              -23.170044f,
+                                              310.81833f,
+                                              -53.023132f,
+                                              105.03287f,
+                                              198.77591f,
+                                              -172.72931f,
+                                              51.805176f,
+                                              -241.10156f,
+                                              64.66101f,
+                                              212.12509f,
+                                              -49.24292f,
+                                              -177.1399f,
+                                              245.23679f,
+                                              173.22421f };
+    };
+    template <>
+    struct uniform_fp_test_params<double> {
+        // These are exactly representable
+        static constexpr double lowest = -234582.9921875;
+        static constexpr double highest = 261238.015625;
+        // These are just round-trip formatted
+        static constexpr double expected[] = { 35031.207052832615,
+                                               203783.3401838024,
+                                               44667.940405848756,
+                                               -170100.5877224467,
+                                               -222966.7418051684,
+                                               127472.72630072923,
+                                               -173510.88209096913,
+                                               97394.16172239158,
+                                               119123.6921592663,
+                                               22595.741022785165,
+                                               8988.68409120926,
+                                               136906.86520606978,
+                                               33369.19104222473,
+                                               60912.7615841752,
+                                               -149060.05936760217 };
+    };
+
+// We need these definitions for C++14 and earlier, but
+// GCC will complain about them in newer C++ standards
+#if __cplusplus <= 201402L
+    constexpr float uniform_fp_test_params<float>::expected[];
+    constexpr double uniform_fp_test_params<double>::expected[];
+#endif
+} // namespace
+
+TEMPLATE_TEST_CASE( "uniform_floating_point_distribution is reproducible",
+                    "[rng][distribution][floating-point][approvals]",
+                    float,
+                    double ) {
+    Catch::SimplePcg32 pcg( 0xaabb'aabb );
+
+    const auto lowest = uniform_fp_test_params<TestType>::lowest;
+    const auto highest = uniform_fp_test_params<TestType>::highest;
+    Catch::uniform_floating_point_distribution<TestType> dist( lowest, highest );
+
+    constexpr auto iters = 15;
+    std::array<TestType, iters> generated;
+    for ( int i = 0; i < iters; ++i ) {
+        generated[i] = dist( pcg );
+    }
+
+    REQUIRE_THAT( generated, Catch::Matchers::RangeEquals( uniform_fp_test_params<TestType>::expected ) );
+}
+
+TEMPLATE_TEST_CASE( "uniform_floating_point_distribution can handle unitary ranges",
+                    "[rng][distribution][floating-point][approvals]",
+                    float,
+                    double ) {
+    std::random_device rd;
+    auto seed = rd();
+    CAPTURE( seed );
+    Catch::SimplePcg32 pcg( seed );
+
+    const auto highest = uniform_fp_test_params<TestType>::highest;
+    Catch::uniform_floating_point_distribution<TestType> dist( highest,
+                                                               highest );
+
+    constexpr auto iters = 20;
+    for (int i = 0; i < iters; ++i) {
+        REQUIRE( Catch::Detail::directCompare( dist( pcg ), highest ) );
+    }
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
index 54c26a7a..e5a65bda 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Reporters.tests.cpp
@@ -12,7 +12,6 @@
 #include <catch2/catch_config.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter.hpp>
 #include <catch2/interfaces/catch_interfaces_reporter_factory.hpp>
-#include <catch2/interfaces/catch_interfaces_reporter_registry.hpp>
 #include <catch2/internal/catch_console_colour.hpp>
 #include <catch2/internal/catch_enforce.hpp>
 #include <catch2/internal/catch_list.hpp>
@@ -110,7 +109,9 @@ TEST_CASE( "Reporter's write listings to provided stream", "[reporters]" ) {
         auto sstream = Catch::Detail::make_unique<StringIStream>();
         auto& sstreamRef = *sstream.get();
 
-        Catch::Config config( Catch::ConfigData{} );
+        Catch::ConfigData cfg_data;
+        cfg_data.rngSeed = 1234;
+        Catch::Config config( cfg_data );
         auto reporter = factory.second->create( Catch::ReporterConfig{
             &config, CATCH_MOVE( sstream ), Catch::ColourMode::None, {} } );
 
@@ -164,7 +165,7 @@ namespace {
                       std::vector<std::string>& recorder,
                       Catch::IConfig const* config ):
             EventListenerBase( config ),
-            m_witness( witness ),
+            m_witness( CATCH_MOVE(witness) ),
             m_recorder( recorder )
         {}
 
@@ -182,7 +183,7 @@ namespace {
                       std::vector<std::string>& recorder,
                       Catch::ReporterConfig&& config ):
             StreamingReporterBase( CATCH_MOVE(config) ),
-            m_witness( witness ),
+            m_witness( CATCH_MOVE(witness) ),
             m_recorder( recorder )
         {}
 
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
index 7a0b3b4a..43c58b49 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/String.tests.cpp
@@ -177,7 +177,7 @@ TEST_CASE("StringRef at compilation time", "[Strings][StringRef][constexpr]") {
         STATIC_REQUIRE_FALSE(sr1.empty());
         STATIC_REQUIRE(sr1.size() == 3);
 
-        using Catch::operator"" _sr;
+        using Catch::operator""_sr;
         constexpr auto sr2 = ""_sr;
         STATIC_REQUIRE(sr2.empty());
         STATIC_REQUIRE(sr2.size() == 0);
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Tag.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Tag.tests.cpp
index ef321b27..43723758 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Tag.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/IntrospectiveTests/Tag.tests.cpp
@@ -98,7 +98,20 @@ TEST_CASE( "Test case with identical tags keeps just one", "[tags]" ) {
     REQUIRE( testCase.tags[0] == Tag( "tag1" ) );
 }
 
-TEST_CASE( "Empty tag is not allowed" ) {
-    REQUIRE_THROWS( Catch::TestCaseInfo(
-        "", { "fake test name", "[]" }, dummySourceLineInfo ) );
+TEST_CASE("Mismatched square brackets in tags are caught and reported",
+          "[tags][approvals]") {
+    using Catch::TestCaseInfo;
+    using Catch::Matchers::ContainsSubstring;
+            REQUIRE_THROWS_WITH( TestCaseInfo( "",
+                                       { "test with unclosed tag", "[abc" },
+                                       dummySourceLineInfo ),
+                         ContainsSubstring("registering test case 'test with unclosed tag'") );
+    REQUIRE_THROWS_WITH( TestCaseInfo( "",
+                      { "test with nested tags", "[abc[def]]" },
+                      dummySourceLineInfo ),
+        ContainsSubstring("registering test case 'test with nested tags'") );
+    REQUIRE_THROWS_WITH( TestCaseInfo( "",
+                      { "test with superfluous close tags", "[abc][def]]" },
+                      dummySourceLineInfo ),
+        ContainsSubstring("registering test case 'test with superfluous close tags'") );
 }
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Exception.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Exception.tests.cpp
index f917932f..4f91a30c 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Exception.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Exception.tests.cpp
@@ -20,7 +20,7 @@
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wweak-vtables"
 #pragma clang diagnostic ignored "-Wmissing-noreturn"
-#pragma clang diagnostic ignored "-Wunreachable-code"
+#pragma clang diagnostic ignored "-Wunreachable-code-return"
 #endif
 
 namespace {
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Generators.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
index 274c63b8..f04cf4f0 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Generators.tests.cpp
@@ -261,6 +261,10 @@ TEST_CASE("Copy and then generate a range", "[generators]") {
     }
 }
 
+#if defined( __clang__ )
+#    pragma clang diagnostic pop
+#endif
+
 TEST_CASE("#1913 - GENERATE inside a for loop should not keep recreating the generator", "[regression][generators]") {
     static int counter = 0;
     for (int i = 0; i < 3; ++i) {
@@ -301,13 +305,19 @@ namespace {
 
 } // namespace
 
-TEST_CASE( "#2615 - Throwing in constructor generator fails test case but does not abort", "[!shouldfail]" ) {
+TEST_CASE( "#2615 - Throwing in constructor generator fails test case but does not abort",
+           "[!shouldfail][regression][generators]" ) {
     // this should fail the test case, but not abort the application
     auto sample = GENERATE( make_test_generator() );
     // this assertion shouldn't trigger
-    REQUIRE( sample == 0U );
+    REQUIRE( sample == 0 );
 }
 
-#if defined( __clang__ )
-#    pragma clang diagnostic pop
-#endif
+TEST_CASE( "GENERATE can combine literals and generators", "[generators]" ) {
+    auto i = GENERATE( 2,
+                       4,
+                       take( 2,
+                             filter( []( int val ) { return val % 2 == 0; },
+                                     random( -100, 100 ) ) ) );
+    REQUIRE( i % 2 == 0 );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Matchers.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Matchers.tests.cpp
index 49e25232..74bedf5e 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Matchers.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Matchers.tests.cpp
@@ -406,6 +406,25 @@ TEST_CASE( "Vector matchers that fail", "[matchers][vector][.][failing]" ) {
     }
 }
 
+namespace {
+    struct SomeType {
+        int i;
+        friend bool operator==( SomeType lhs, SomeType rhs ) {
+            return lhs.i == rhs.i;
+        }
+    };
+} // end anonymous namespace
+
+TEST_CASE( "Vector matcher with elements without !=", "[matchers][vector][approvals]" ) {
+    std::vector<SomeType> lhs, rhs;
+    lhs.push_back( { 1 } );
+    lhs.push_back( { 2 } );
+    rhs.push_back( { 1 } );
+    rhs.push_back( { 1 } );
+
+    REQUIRE_THAT( lhs, !Equals(rhs) );
+}
+
 TEST_CASE( "Exception matchers that succeed",
            "[matchers][exceptions][!throws]" ) {
     CHECK_THROWS_MATCHES(
@@ -871,7 +890,7 @@ struct MatcherA : Catch::Matchers::MatcherGenericBase {
         return "equals: (int) 1 or (string) \"1\"";
     }
     bool match( int i ) const { return i == 1; }
-    bool match( std::string s ) const { return s == "1"; }
+    bool match( std::string const& s ) const { return s == "1"; }
 };
 
 struct MatcherB : Catch::Matchers::MatcherGenericBase {
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/MatchersRanges.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/MatchersRanges.tests.cpp
index 05e11b0c..cc8c54f8 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/MatchersRanges.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/MatchersRanges.tests.cpp
@@ -381,7 +381,7 @@ TEST_CASE("Usage of AllTrue range matcher", "[matchers][templated][quantifiers]"
             std::array<bool, 0> const data{};
             REQUIRE_THAT( data, AllTrue() );
         }
-        SECTION( "One false evalutes to false" ) {
+        SECTION( "One false evaluates to false" ) {
             std::array<bool, 5> const data{ { true, true, false, true, true } };
             REQUIRE_THAT( data, !AllTrue() );
         }
@@ -398,7 +398,7 @@ TEST_CASE("Usage of AllTrue range matcher", "[matchers][templated][quantifiers]"
                 { { true }, { true }, { true }, { true }, { true } } };
             REQUIRE_THAT( data, AllTrue() );
         }
-        SECTION( "One false evalutes to false" ) {
+        SECTION( "One false evaluates to false" ) {
             std::array<ConvertibleToBool, 5> const data{
                 { { true }, { true }, { false }, { true }, { true } } };
             REQUIRE_THAT( data, !AllTrue() );
@@ -446,7 +446,7 @@ TEST_CASE( "Usage of NoneTrue range matcher", "[matchers][templated][quantifiers
             std::array<bool, 0> const data{};
             REQUIRE_THAT( data, NoneTrue() );
         }
-        SECTION( "One true evalutes to false" ) {
+        SECTION( "One true evaluates to false" ) {
             std::array<bool, 5> const data{
                 { false, false, true, false, false } };
             REQUIRE_THAT( data, !NoneTrue() );
@@ -464,7 +464,7 @@ TEST_CASE( "Usage of NoneTrue range matcher", "[matchers][templated][quantifiers
                 { { true }, { true }, { true }, { true }, { true } } };
             REQUIRE_THAT( data, !NoneTrue() );
         }
-        SECTION( "One true evalutes to false" ) {
+        SECTION( "One true evaluates to false" ) {
             std::array<ConvertibleToBool, 5> const data{
                 { { false }, { false }, { true }, { false }, { false } } };
             REQUIRE_THAT( data, !NoneTrue() );
@@ -512,7 +512,7 @@ TEST_CASE( "Usage of AnyTrue range matcher", "[matchers][templated][quantifiers]
             std::array<bool, 0> const data{};
             REQUIRE_THAT( data, !AnyTrue() );
         }
-        SECTION( "One true evalutes to true" ) {
+        SECTION( "One true evaluates to true" ) {
             std::array<bool, 5> const data{
                 { false, false, true, false, false } };
             REQUIRE_THAT( data, AnyTrue() );
@@ -530,7 +530,7 @@ TEST_CASE( "Usage of AnyTrue range matcher", "[matchers][templated][quantifiers]
                 { { true }, { true }, { true }, { true }, { true } } };
             REQUIRE_THAT( data, AnyTrue() );
         }
-        SECTION( "One true evalutes to true" ) {
+        SECTION( "One true evaluates to true" ) {
             std::array<ConvertibleToBool, 5> const data{
                 { { false }, { false }, { true }, { false }, { false } } };
             REQUIRE_THAT( data, AnyTrue() );
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Message.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Message.tests.cpp
index a5e69582..6367bf59 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Message.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Message.tests.cpp
@@ -255,10 +255,24 @@ std::ostream& operator<<(std::ostream& out, helper_1436<T1, T2> const& helper) {
 #pragma clang diagnostic ignored "-Wunused-value"
 #endif
 
+namespace {
+    template <typename T>
+    struct custom_index_op {
+        constexpr custom_index_op( std::initializer_list<T> ) {}
+        constexpr T operator[]( size_t ) { return T{}; }
+#if defined( __cpp_multidimensional_subscript ) && \
+    __cpp_multidimensional_subscript >= 202110L
+        constexpr T operator[]( size_t, size_t, size_t ) const noexcept {
+            return T{};
+        }
+#endif
+    };
+}
+
 TEST_CASE("CAPTURE can deal with complex expressions involving commas", "[messages][capture]") {
-    CAPTURE(std::vector<int>{1, 2, 3}[0, 1, 2],
-            std::vector<int>{1, 2, 3}[(0, 1)],
-            std::vector<int>{1, 2, 3}[0]);
+    CAPTURE(custom_index_op<int>{1, 2, 3}[0, 1, 2],
+            custom_index_op<int>{1, 2, 3}[(0, 1)],
+            custom_index_op<int>{1, 2, 3}[0]);
     CAPTURE((helper_1436<int, int>{12, -12}),
             (helper_1436<int, int>(-12, 12)));
     CAPTURE( (1, 2), (2, 3) );
@@ -285,3 +299,14 @@ TEST_CASE("CAPTURE parses string and character constants", "[messages][capture]"
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif
+
+TEST_CASE( "INFO and UNSCOPED_INFO can stream multiple arguments",
+           "[messages][info][.failing]" ) {
+    INFO( "This info"
+          << " has multiple"
+          << " parts." );
+    UNSCOPED_INFO( "This unscoped info"
+                   << " has multiple"
+                   << " parts." );
+    FAIL( "Show infos!" );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Misc.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
index 6c1fd68f..7f06704b 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Misc.tests.cpp
@@ -217,6 +217,18 @@ TEST_CASE("Testing checked-if 3", "[checked-if][!shouldfail]") {
     SUCCEED();
 }
 
+[[noreturn]]
+TEST_CASE("Testing checked-if 4", "[checked-if][!shouldfail]") {
+    CHECKED_ELSE(true) {}
+    throw std::runtime_error("Uncaught exception should fail!");
+}
+
+[[noreturn]]
+TEST_CASE("Testing checked-if 5", "[checked-if][!shouldfail]") {
+    CHECKED_ELSE(false) {}
+    throw std::runtime_error("Uncaught exception should fail!");
+}
+
 TEST_CASE( "xmlentitycheck" ) {
     SECTION( "embedded xml: <test>it should be possible to embed xml characters, such as <, \" or &, or even whole <xml>documents</xml> within an attribute</test>" ) {
         SUCCEED(); // We need this here to stop it failing due to no tests
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Skip.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Skip.tests.cpp
index 6bd4189b..661795e1 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Skip.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/Skip.tests.cpp
@@ -71,3 +71,30 @@ TEST_CASE( "failing for some generator values causes entire test case to fail",
         FAIL();
     }
 }
+
+namespace {
+    class test_skip_generator : public Catch::Generators::IGenerator<int> {
+    public:
+        explicit test_skip_generator() { SKIP( "This generator is empty" ); }
+
+        auto get() const -> int const& override {
+            static constexpr int value = 1;
+            return value;
+        }
+
+        auto next() -> bool override { return false; }
+    };
+
+    static auto make_test_skip_generator()
+        -> Catch::Generators::GeneratorWrapper<int> {
+        return { new test_skip_generator() };
+    }
+
+} // namespace
+
+TEST_CASE( "Empty generators can SKIP in constructor", "[skipping]" ) {
+    // The generator signals emptiness with `SKIP`
+    auto sample = GENERATE( make_test_skip_generator() );
+    // This assertion would fail, but shouldn't trigger
+    REQUIRE( sample == 0 );
+}
diff --git a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
index 9fd9d6b4..3671771a 100644
--- a/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
+++ b/alpaka/thirdParty/catch2/tests/SelfTest/UsageTests/ToStringOptional.tests.cpp
@@ -28,4 +28,8 @@ TEST_CASE( "std::vector<std::optional<int> > -> toString", "[toString][optional]
     REQUIRE( "{ 0, { }, 2 }" == ::Catch::Detail::stringify( type{ 0, {}, 2 } ) );
 }
 
+TEST_CASE( "std::nullopt -> toString", "[toString][optional][approvals]" ) {
+    REQUIRE( "{ }" == ::Catch::Detail::stringify( std::nullopt ) );
+}
+
 #endif // CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL
diff --git a/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt
new file mode 100644
index 00000000..d19f2f88
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(discover-tests-test
+  LANGUAGES CXX
+)
+
+add_executable(tests
+  register-tests.cpp
+)
+
+add_subdirectory(${CATCH2_PATH} catch2-build)
+target_link_libraries(tests PRIVATE Catch2::Catch2WithMain)
+
+include(CTest)
+include(Catch)
+catch_discover_tests(tests)
diff --git a/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py
new file mode 100644
index 00000000..9ec42f24
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/VerifyRegistration.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+#              Copyright Catch2 Authors
+# Distributed under the Boost Software License, Version 1.0.
+#   (See accompanying file LICENSE.txt or copy at
+#        https://www.boost.org/LICENSE_1_0.txt)
+
+# SPDX-License-Identifier: BSL-1.0
+
+import os
+import subprocess
+import sys
+
+
+def build_project(sources_dir, output_base_path, catch2_path):
+    build_dir = os.path.join(output_base_path, 'ctest-registration-test')
+    config_cmd = ['cmake',
+                  '-B', build_dir,
+                  '-S', sources_dir,
+                  f'-DCATCH2_PATH={catch2_path}',
+                  '-DCMAKE_BUILD_TYPE=Debug']
+
+    build_cmd = ['cmake',
+                 '--build', build_dir,
+                 '--config', 'Debug']
+
+    try:
+        subprocess.run(config_cmd,
+                       capture_output = True,
+                       check = True,
+                       text = True)
+        subprocess.run(build_cmd,
+                       capture_output = True,
+                       check = True,
+                       text = True)
+    except subprocess.CalledProcessError as err:
+        print('Error when building the test project')
+        print(f'cmd: {err.cmd}')
+        print(f'stderr: {err.stderr}')
+        print(f'stdout: {err.stdout}')
+        exit(3)
+
+    return build_dir
+
+
+
+def get_test_names(build_path):
+    # For now we assume that Windows builds are done using MSBuild under
+    # Debug configuration. This means that we need to add "Debug" folder
+    # to the path when constructing it. On Linux, we don't add anything.
+    config_path = "Debug" if os.name == 'nt' else ""
+    full_path = os.path.join(build_path, config_path, 'tests')
+
+
+    cmd = [full_path, '--reporter', 'xml', '--list-tests']
+    result = subprocess.run(cmd,
+                            capture_output = True,
+                            check = True,
+                            text = True)
+
+    import xml.etree.ElementTree as ET
+    root = ET.fromstring(result.stdout)
+    return [tc.text for tc in root.findall('TestCase/Name')]
+
+
+def list_ctest_tests(build_path):
+    old_path = os.getcwd()
+    os.chdir(build_path)
+
+    cmd = ['ctest', '-C', 'debug', '--show-only=json-v1']
+    result = subprocess.run(cmd,
+                            capture_output = True,
+                            check = True,
+                            text = True)
+    os.chdir(old_path)
+
+    import json
+
+    ctest_response = json.loads(result.stdout)
+    tests = ctest_response['tests']
+    test_names = []
+    for test in tests:
+        test_command = test['command']
+        # First part of the command is the binary, second is the filter.
+        # If there are less, registration has failed. If there are more,
+        # registration has changed and the script needs updating.
+        assert len(test_command) == 2
+        test_names.append(test_command[1])
+        test_name = test_command[1]
+
+    return test_names
+
+def escape_catch2_test_name(name):
+    for char in ('\\', ',', '[', ']'):
+        name = name.replace(char, f"\\{char}")
+    return name
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print(f'Usage: {sys.argv[0]} path-to-catch2-cml output-path')
+        exit(2)
+    catch2_path = sys.argv[1]
+    output_base_path = sys.argv[2]
+    sources_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+
+    build_path = build_project(sources_dir, output_base_path, catch2_path)
+
+    catch_test_names = [escape_catch2_test_name(name) for name in get_test_names(build_path)]
+    ctest_test_names = list_ctest_tests(build_path)
+
+    mismatched = 0
+    for catch_test in catch_test_names:
+        if catch_test not in ctest_test_names:
+            print(f"Catch2 test '{catch_test}' not found in CTest")
+            mismatched += 1
+    for ctest_test in ctest_test_names:
+        if ctest_test not in catch_test_names:
+            print(f"CTest test '{ctest_test}' not found in Catch2")
+            mismatched += 1
+
+    if mismatched:
+        print(f"Found {mismatched} mismatched tests catch test names and ctest test commands!")
+        exit(1)
diff --git a/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/register-tests.cpp b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/register-tests.cpp
new file mode 100644
index 00000000..aa603df1
--- /dev/null
+++ b/alpaka/thirdParty/catch2/tests/TestScripts/DiscoverTests/register-tests.cpp
@@ -0,0 +1,16 @@
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#include <catch2/catch_test_macros.hpp>
+
+TEST_CASE("@Script[C:\\EPM1A]=x;\"SCALA_ZERO:\"", "[script regressions]"){}
+TEST_CASE("Some test") {}
+TEST_CASE( "Let's have a test case with a long name. Longer. No, even longer. "
+           "Really looooooooooooong. Even longer than that. Multiple lines "
+           "worth of test name. Yep, like this." ) {}
+TEST_CASE( "And now a test case with weird tags.", "[tl;dr][tl;dw][foo,bar]" ) {}
diff --git a/alpaka/thirdParty/catch2/tests/meson.build b/alpaka/thirdParty/catch2/tests/meson.build
index f525f041..58302b7a 100644
--- a/alpaka/thirdParty/catch2/tests/meson.build
+++ b/alpaka/thirdParty/catch2/tests/meson.build
@@ -17,6 +17,7 @@ self_test_sources = files(
   'SelfTest/IntrospectiveTests/Details.tests.cpp',
   'SelfTest/IntrospectiveTests/FloatingPoint.tests.cpp',
   'SelfTest/IntrospectiveTests/GeneratorsImpl.tests.cpp',
+  'SelfTest/IntrospectiveTests/Integer.tests.cpp',
   'SelfTest/IntrospectiveTests/InternalBenchmark.tests.cpp',
   'SelfTest/IntrospectiveTests/Parse.tests.cpp',
   'SelfTest/IntrospectiveTests/PartTracker.tests.cpp',
diff --git a/alpaka/thirdParty/catch2/tools/scripts/checkLicense.py b/alpaka/thirdParty/catch2/tools/scripts/checkLicense.py
index 9a949769..7078d3ec 100755
--- a/alpaka/thirdParty/catch2/tools/scripts/checkLicense.py
+++ b/alpaka/thirdParty/catch2/tools/scripts/checkLicense.py
@@ -33,7 +33,8 @@ def check_licences_in_path(path: str) -> int:
 
 def check_licences():
     failed = 0
-    roots = ['src/catch2', 'tests']
+    # Add 'extras' after the amalgamted files are regenerated with the new script (past 3.4.0)
+    roots = ['src/catch2', 'tests', 'examples', 'fuzzing']
     for root in roots:
         failed += check_licences_in_path(root)
     
diff --git a/alpaka/thirdParty/catch2/tools/scripts/generateAmalgamatedFiles.py b/alpaka/thirdParty/catch2/tools/scripts/generateAmalgamatedFiles.py
index 99fc446b..e3e86aab 100755
--- a/alpaka/thirdParty/catch2/tools/scripts/generateAmalgamatedFiles.py
+++ b/alpaka/thirdParty/catch2/tools/scripts/generateAmalgamatedFiles.py
@@ -1,4 +1,9 @@
 #!/usr/bin/env python3
+#              Copyright Catch2 Authors
+# Distributed under the Boost Software License, Version 1.0.
+#   (See accompanying file LICENSE.txt or copy at
+#        https://www.boost.org/LICENSE_1_0.txt)
+# SPDX-License-Identifier: BSL-1.0
 
 import os
 import re
@@ -12,6 +17,8 @@
 output_header = os.path.join(catchPath, 'extras', 'catch_amalgamated.hpp')
 output_cpp = os.path.join(catchPath, 'extras', 'catch_amalgamated.cpp')
 
+# REUSE-IgnoreStart
+
 # These are the copyright comments in each file, we want to ignore them
 copyright_lines = [
 '//              Copyright Catch2 Authors\n',
@@ -24,6 +31,7 @@
 # The header of the amalgamated file: copyright information + explanation
 # what this file is.
 file_header = '''\
+
 //              Copyright Catch2 Authors
 // Distributed under the Boost Software License, Version 1.0.
 //   (See accompanying file LICENSE.txt or copy at
@@ -39,6 +47,8 @@
 //  ----------------------------------------------------------
 '''
 
+# REUSE-IgnoreEnd
+
 # Returns file header with proper version string and generation time
 def formatted_file_header(version):
     return file_header.format(version_string=version.getVersionString(),
diff --git a/alpaka/thirdParty/catch2/tools/scripts/releaseCommon.py b/alpaka/thirdParty/catch2/tools/scripts/releaseCommon.py
index 0d995eaf..1ff4af29 100644
--- a/alpaka/thirdParty/catch2/tools/scripts/releaseCommon.py
+++ b/alpaka/thirdParty/catch2/tools/scripts/releaseCommon.py
@@ -114,8 +114,8 @@ def updateVersionDefine(version):
 def updateVersionPlaceholder(filename, version):
     with open(filename, 'rb') as file:
         lines = file.readlines()
-    placeholderRegex = re.compile(b'in Catch[0-9]? X.Y.Z')
-    replacement = 'in Catch2 {}.{}.{}'.format(version.majorVersion, version.minorVersion, version.patchNumber).encode('ascii')
+    placeholderRegex = re.compile(b'Catch[0-9]? X.Y.Z')
+    replacement = 'Catch2 {}.{}.{}'.format(version.majorVersion, version.minorVersion, version.patchNumber).encode('ascii')
     with open(filename, 'wb') as file:
         for line in lines:
             file.write(placeholderRegex.sub(replacement, line))
diff --git a/alpaka/thirdParty/catch2/tools/scripts/updateDocumentToC.py b/alpaka/thirdParty/catch2/tools/scripts/updateDocumentToC.py
index 7b56cfc7..1840cecc 100755
--- a/alpaka/thirdParty/catch2/tools/scripts/updateDocumentToC.py
+++ b/alpaka/thirdParty/catch2/tools/scripts/updateDocumentToC.py
@@ -287,7 +287,7 @@ def markdownToclify(
         Path to the markdown output file.
 
       min_toc_len: int (default: 2)
-        Miniumum number of entries to create a table of contents for.
+        Minimum number of entries to create a table of contents for.
 
       github: bool (default: False)
         Uses GitHub TOC syntax if True.