diff --git a/INSTALL.md b/INSTALL.md
index 709b8017..84549e43 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -4,7 +4,7 @@ Install
- C++14 compiler (clang, gcc, hipcc, icc, nvcc)
- *Debian/Ubuntu:* `sudo apt-get install gcc build-essential`
- *Arch Linux:* `sudo pacman -S base-devel`
- - `alpaka` 1.1.0
+ - `alpaka` 1.2.0
- included as git submodule
- `boost` >= 1.65.1
- dependency of alpaka
diff --git a/alpaka/.github/workflows/ci.yml b/alpaka/.github/workflows/ci.yml
index 709d1c5f..fe14a5a3 100644
--- a/alpaka/.github/workflows/ci.yml
+++ b/alpaka/.github/workflows/ci.yml
@@ -14,16 +14,15 @@ concurrency:
################################################################################
# NOTE: Testing the full matrix is not practical.
# Therefore we aim to have each value been set in at lest one job.
-# CXX : {g++, clang++}
+# ALPAKA_CI_CXX : {g++, clang++}
# [g++] ALPAKA_CI_GCC_VER : {9, 10, 11, 12, 13}
# [clang++] ALPAKA_CI_CLANG_VER : {9, 10, 11, 12, 13, 14}
# [cl.exe] ALPAKA_CI_CL_VER : {2022}
-# ALPAKA_CI_STDLIB : {libstdc++, [CXX==clang++]:libc++}
+# ALPAKA_CI_STDLIB : {libstdc++, [ALPAKA_CI_CXX==clang++]:libc++}
# CMAKE_BUILD_TYPE : {Debug, Release}
# alpaka_CI : {GITHUB}
-# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME : {ubuntu:20.04, ubuntu:22.04}
-# ALPAKA_BOOST_VERSION : {1.74.0, 1.75.0, 1.76.0, 1.77.0, 1.78.0, 1.79.0, 1.80.0, 1.81.0, 1.82.0}
-# ALPAKA_CI_CMAKE_VER : {3.22.6, 3.23.5, 3.24.4, 3.25.3, 3.26.4}
+# ALPAKA_BOOST_VERSION : {1.74.0, 1.75.0, 1.76.0, 1.77.0, 1.78.0, 1.79.0, 1.80.0, 1.81.0, 1.82.0, 1.83.0, 1.84.0, 1.85.0, 1.86.0}
+# ALPAKA_CI_CMAKE_VER : {3.22.6, 3.23.5, 3.24.4, 3.25.3, 3.26.4, 3.27.9, 3.28.6, 3.29.8, 3.30.3}
# ALPAKA_CI_XCODE_VER : {13.2.1, 14.2}
# ALPAKA_CI_SANITIZERS : {ASan, UBsan, TSan}
# TSan is not currently used because it produces many unexpected errors
@@ -40,8 +39,8 @@ concurrency:
# alpaka_ACC_ANY_BT_OMP5_ENABLE : {ON, OFF}
# [ON] OMP_NUM_THREADS : {1, 2, 3, 4}
# alpaka_ACC_GPU_CUDA_ENABLE : {ON, OFF}
-# [ON] ALPAKA_CI_CUDA_VERSION : {11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6}
-# [ON] CMAKE_CUDA_COMPILER : {nvcc, [CXX==clang++]:clang++}
+# [ON] ALPAKA_CI_CUDA_VERSION : {11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6}
+# [ON] ALPAKA_CI_CUDA_COMPILER : {nvcc, [ALPAKA_CI_CXX==clang++]:clang++}
# alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE : {ON, OFF}
# alpaka_ACC_GPU_HIP_ENABLE : {ON, OFF}
# [ON] ALPAKA_CI_HIP_BRANCH : {rocm-4.2}
@@ -59,9 +58,10 @@ env:
ALPAKA_CI_HIP_ROOT_DIR: ${{ github.workspace }}/hip
ALPAKA_CI_SANITIZERS: ""
ALPAKA_CI_ANALYSIS: OFF
- ALPAKA_CI_ONEAPI_VERSION: 2023.2.0
+ ALPAKA_CI_ONEAPI_VERSION: 2024.0
ALPAKA_CI_TBB_VERSION: 2021.10.0
ALPAKA_CI_RUN_TESTS: ON
+ alpaka_CXX_STANDARD: 17
alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: ON
alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: ON
alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: ON
@@ -99,6 +99,7 @@ jobs:
name: ${{ matrix.name }}
runs-on: ${{ matrix.os }}
env: ${{ matrix.env }}
+ container: ${{ matrix.container }}
strategy:
fail-fast: false
@@ -107,41 +108,43 @@ jobs:
### Analysis builds
- name: linux_clang-14_cuda-11.2_debug_analysis
os: ubuntu-20.04
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 14, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_CI_RUN_TESTS: OFF, alpaka_DEBUG: 1, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "11.2", CMAKE_CUDA_COMPILER: clang++, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 14, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_ANALYSIS: ON, ALPAKA_CI_RUN_TESTS: OFF, alpaka_DEBUG: 1, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "11.2", ALPAKA_CI_CUDA_COMPILER : clang++, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+ container: ubuntu:20.04
- name: windows_cl-2022_debug_analysis
os: windows-2022
- env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.23.5, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+ env: {ALPAKA_CI_CXX: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.23.5, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
- name: macos_xcode-14.2_debug_analysis
os: macos-12
- env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 14.2, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.2, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
- name: linux_gcc-12_debug_analysis
os: ubuntu-22.04
- env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 12, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+ env: {ALPAKA_CI_CXX: g++, ALPAKA_CI_GCC_VER: 12, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_ANALYSIS: ON, alpaka_DEBUG: 2}
+ container: ubuntu:22.04
### macOS
- name: macos_xcode-14.2_release
os: macos-12
- env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 14.2, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.2, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
- name: macos_xcode-14.3.1_debug
os: macos-13
- env: {CXX: clang++, CC: clang, ALPAKA_CI_XCODE_VER: 14.3.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.82.0, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_XCODE_VER: 14.3.1, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.82.0, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON, ALPAKA_CI_BUILD_JOBS: 3}
### Windows
- name: windows_cl-2022_release
os: windows-2022
- env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 1}
+ env: {ALPAKA_CI_CXX: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 1}
- name: windows_cl-2022_debug
os: windows-2022
- env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.25.1, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+ env: {ALPAKA_CI_CXX: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.25.1, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
## CUDA 12.1
# nvcc + MSVC
# - name: windows_nvcc-12.1_cl-2022_release_cuda-only
# os: windows-2022
- # env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_CMAKE_VER: 3.24.4, ALPAKA_CI_RUN_TESTS: OFF, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", CMAKE_CUDA_ARCHITECTURES: "50;90", alpaka_ACC_GPU_CUDA_ONLY_MODE: ON, alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+ # env: {ALPAKA_CI_CXX: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.81.0, ALPAKA_CI_CMAKE_VER: 3.24.4, ALPAKA_CI_RUN_TESTS: OFF, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", CMAKE_CUDA_ARCHITECTURES: "50;90", alpaka_ACC_GPU_CUDA_ONLY_MODE: ON, alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
# - name: windows_nvcc-12.1_cl-2022_debug
# os: windows-2022
- # env: {CXX: cl.exe, CC: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.25.1, ALPAKA_CI_RUN_TESTS: OFF, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", alpaka_ACC_CPU_BT_OMP5_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+ # env: {ALPAKA_CI_CXX: cl.exe, ALPAKA_CI_CL_VER: 2022, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.78.0, ALPAKA_CI_CMAKE_VER: 3.25.1, ALPAKA_CI_RUN_TESTS: OFF, alpaka_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CI_CUDA_VERSION: "12.1", alpaka_ACC_CPU_BT_OMP5_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
### Ubuntu
## native
@@ -153,13 +156,16 @@ jobs:
# - Ubuntu 22.04
- name: linux_gcc-9_debug
os: ubuntu-20.04
- env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.74.0, ALPAKA_CI_CMAKE_VER: 3.22.6, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_EXTENSIONS: OFF}
+ env: {ALPAKA_CI_CXX: g++, ALPAKA_CI_GCC_VER: 9, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.74.0, ALPAKA_CI_CMAKE_VER: 3.22.6, OMP_NUM_THREADS: 4, CMAKE_CXX_EXTENSIONS: OFF}
+ container: ubuntu:20.04
- name: linux_gcc-12_release_c++20
os: ubuntu-22.04
- env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 12, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", alpaka_CXX_STANDARD: 20, alpaka_USE_MDSPAN: "FETCH"}
+ env: {ALPAKA_CI_CXX: g++, ALPAKA_CI_GCC_VER: 12, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.80.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 2, alpaka_CXX_STANDARD: 20, alpaka_USE_MDSPAN: "FETCH"}
+ container: ubuntu:22.04
- name: linux_gcc-13_debug
os: ubuntu-22.04
- env: {CXX: g++, CC: gcc, ALPAKA_CI_GCC_VER: 13, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.82.0, ALPAKA_CI_CMAKE_VER: 3.26.4, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04"}
+ env: {ALPAKA_CI_CXX: g++, ALPAKA_CI_GCC_VER: 13, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.82.0, ALPAKA_CI_CMAKE_VER: 3.26.4, OMP_NUM_THREADS: 2}
+ container: ubuntu:22.04
# TODO: keep jobs until GitLab CI supports:
# - disable CMAKE_CXX_EXTENSIONS=OFF
@@ -169,20 +175,25 @@ jobs:
# clang++
- name: linux_clang-10_release
os: ubuntu-20.04
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.75.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 10, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.75.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+ container: ubuntu:20.04
# clang-11 tested in GitLab CI
- name: linux_clang-12_release
os: ubuntu-20.04
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 12, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 12, ALPAKA_CI_STDLIB: libc++, CMAKE_BUILD_TYPE: Release, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.24.4, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF}
+ container: ubuntu:20.04
- name: linux_clang-13_debug
os: ubuntu-22.04
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 13, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_EXTENSIONS: OFF}
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 13, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.79.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 3, CMAKE_CXX_EXTENSIONS: OFF}
+ container: ubuntu:22.04
- name: linux_clang-16_debug_ubsan
- os: ubuntu-latest
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 16, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: UBSan}
+ os: ubuntu-22.04
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 16, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: UBSan}
+ container: ubuntu:22.04
- name: linux_clang-16_debug_tsan
- os: ubuntu-latest
- env: {CXX: clang++, CC: clang, ALPAKA_CI_CLANG_VER: 16, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:22.04", alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: TSan}
+ os: ubuntu-22.04
+ env: {ALPAKA_CI_CXX: clang++, ALPAKA_CI_CLANG_VER: 16, ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug, ALPAKA_BOOST_VERSION: 1.77.0, ALPAKA_CI_CMAKE_VER: 3.23.5, OMP_NUM_THREADS: 4, alpaka_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, alpaka_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, alpaka_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_CI_SANITIZERS: TSan}
+ container: ubuntu:22.04
steps:
- name: check filter
diff --git a/alpaka/.github/workflows/gh-pages.yml b/alpaka/.github/workflows/gh-pages.yml
index 256ee452..993f85f6 100644
--- a/alpaka/.github/workflows/gh-pages.yml
+++ b/alpaka/.github/workflows/gh-pages.yml
@@ -6,6 +6,8 @@ on:
push:
branches:
- develop
+env:
+ ALPAKA_CI_OS_NAME: "Linux"
jobs:
gh-pages:
diff --git a/alpaka/.gitignore b/alpaka/.gitignore
index 6c946c89..2dff37de 100644
--- a/alpaka/.gitignore
+++ b/alpaka/.gitignore
@@ -8,6 +8,7 @@
# tmp files
*~
+.*.swp
# netbeans project files
/nbproject/
diff --git a/alpaka/.gitlab-ci.yml b/alpaka/.gitlab-ci.yml
index 549f8ed9..a649e23e 100644
--- a/alpaka/.gitlab-ci.yml
+++ b/alpaka/.gitlab-ci.yml
@@ -17,7 +17,7 @@ variables:
# container version of the generated jobs
# should be merged with ALPAKA_GITLAB_CI_CONTAINER_VERSION
# see: script/job_generator/generate_job_yaml.py
- ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.1"
+ ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION: "3.2"
generate:
stage: generator
@@ -27,7 +27,9 @@ generate:
- apk update && apk add python3~=3.11 py3-pip
- pip3 install -r script/job_generator/requirements.txt
# it is sufficient to verify once, as the same job matrix is generated, verified and then filtered each time
- - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
+ # disable verify because we know that the generator is broken: https://github.com/thombashi/allpairspy/pull/10
+ #- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --verify --wave compile_only_job -o compile_only.yml
+ - python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave compile_only_job -o compile_only.yml
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_cpu -o runtime_cpu.yml
- python3 script/job_generator/job_generator.py ${ALPAKA_GITLAB_CI_GENERATOR_CONTAINER_VERSION} --wave runtime_job_gpu -o runtime_gpu.yml
- cat compile_only.yml
diff --git a/alpaka/.pre-commit-config.yaml b/alpaka/.pre-commit-config.yaml
new file mode 100644
index 00000000..065efe81
--- /dev/null
+++ b/alpaka/.pre-commit-config.yaml
@@ -0,0 +1,45 @@
+minimum_pre_commit_version: 3.2.0 # necessitated by Lucas-C's hooks
+default_install_hook_types: [pre-commit, pre-push]
+exclude: 'thirdParty'
+repos:
+- repo: https://github.com/pre-commit/mirrors-clang-format
+ rev: v16.0.6
+ hooks:
+ - id: clang-format
+ files: \.(cpp|hpp)
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: no-commit-to-branch
+ args: [-b, develop]
+ - id: check-merge-conflict
+ - id: trailing-whitespace
+ exclude_types: [markdown, rst]
+ - id: end-of-file-fixer
+ - id: check-toml
+ - id: check-yaml
+ - id: mixed-line-ending
+ - id: check-executables-have-shebangs
+ - id: check-shebang-scripts-are-executable
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+ rev: v1.5.4
+ hooks:
+ - id: forbid-tabs
+ types_or: [file]
+ exclude_types: [rst]
+ - id: remove-tabs
+ types_or: [file]
+ exclude_types: [rst]
+ - id: forbid-crlf
+ - id: remove-crlf
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.2.1
+ hooks:
+ - id: ruff
+ types_or: [ python, pyi, jupyter ]
+ # The ignores in here are chosen to conform with the currently
+ # existing code and not motivated any other way.
+ args: [ --fix, --ignore, "F403,F405,E731"]
+ - id: ruff-format
+ types_or: [ python, pyi, jupyter ]
+ args: ["--line-length", "120"]
diff --git a/alpaka/.zenodo.json b/alpaka/.zenodo.json
index a2c1e618..9b11be3f 100644
--- a/alpaka/.zenodo.json
+++ b/alpaka/.zenodo.json
@@ -12,29 +12,46 @@
"affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
"orcid": "0000-0002-8218-3116"
},
+ {
+ "name": "Erdem, Sven",
+ "affiliation": "Helmholtz-Zentrum Berlin"
+ },
+ {
+ "name": "Fila, Mateusz Jakub",
+ "affiliation": "CERN"
+ },
{
"name": "Gruber, Bernhard Manfred",
"affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf, CERN",
"orcid": "0000-0001-7848-1690"
},
{
- "name": "Martin-Haugh, Stewart",
- "affiliation": "STFC Rutherford Appleton Laboratory",
- "orcid": "0000-0001-9457-1928"
+ "name": "Lenz, Julian",
+ "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+ "orcid": "0000-0001-5250-0005"
},
{
"name": "Perego, Aurora",
"affiliation": "CERN",
- "orcid": "0000-0003-1576-6757"
+ "orcid": "0009-0002-5210-6213"
},
{
- "name": "Tascon, Andres Rios",
- "affiliation": "Princeton University"
+ "name": "Varvarin, Michael",
+ "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf"
+ },
+ {
+ "name": "Vyskočil, Jiří",
+ "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+ "orcid": "0000-0001-8822-0929"
},
{
"name": "Widera, René",
"affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
"orcid": "0000-0003-1642-0459"
+ },
+ {
+ "name": "Yusufoglu, Mehmet",
+ "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf"
}
],
"contributors": [
@@ -98,6 +115,12 @@
"affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
"type": "Other"
},
+ {
+ "name": "Martin-Haugh, Stewart",
+ "affiliation": "STFC Rutherford Appleton Laboratory",
+ "orcid": "0000-0001-9457-1928",
+ "type": "Other"
+ },
{
"affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
"name": "Matthes, Alexander",
@@ -143,14 +166,13 @@
"type": "Other"
},
{
- "name": "Vollmer, Daniel",
- "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V.",
+ "name": "Tascon, Andres Rios",
+ "affiliation": "Princeton University",
"type": "Other"
},
{
- "name": "Vyskočil, Jiří",
- "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
- "orcid": "0000-0001-8822-0929",
+ "name": "Vollmer, Daniel",
+ "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V.",
"type": "Other"
},
{
diff --git a/alpaka/CHANGELOG.md b/alpaka/CHANGELOG.md
index dab9f20e..6af19ec2 100644
--- a/alpaka/CHANGELOG.md
+++ b/alpaka/CHANGELOG.md
@@ -3,6 +3,65 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+## [1.2.0] - 2024-10-02
+
+### Added
+
+- device functions to simplify writing kernel code #2337 #2369 #2383
+- support Clang 18 and 19 #2387
+- support oneAPI 2024.2 #2368
+- support for mapped memory allocation for the SYCL backend #2375
+- support for pre-commit #2253
+- support for device and constant global variables in the SYCL backend #2242
+- alpaka::meta::isList, alpaka::meta::ToList and alpaka::meta::toTuple #2269
+- accelerator trait to check for single and multi-threads per block #2263
+- trait IsKernelTriviallyCopyable #2302
+- trait AccIsEnabled #2267
+- documentation: cmake flag to build alpaka benchmarks #2272
+- benchmark: babelstream support for different Accs #2299
+- example: using MdSpan to pass 2D data #2293
+- example: 2D heat equation #2365 #2383
+- example: Convolution #2228 #2220
+
+### Changed
+
+- update cheatsheet.rst #2398 #2386 #2241
+- signature of `[get|is]ValidWorkDiv*` #2349
+- use shared CUDA libraries by default #2348 #2342
+- add thread count to CPU blocks accelerators #2338
+- link libcudart even when libcurand is not used #2329
+- ctest: display only output of tests, which failed #2322
+- example: Matrix Multiplication use MdSpan #2317
+- move the Complex class to internal namespace #2301
+- run examples with all enabled accelerators #2280
+- template order allocMappedBuf #2270
+- slow getWarpSize problem #2246
+- simplification of workdiv creation #2240
+- benchmarks: move from examples into own directory #2237
+
+### Fixed
+
+- `[get|is]ValidWorkDiv*` #2349 #2335
+- cray clang compiler errors #2392
+- fix and update SYCL targets #2390 #2361
+- single thread acc throw for invalid workdiv fix #2391
+- explicitly call alpaka::detail to achieve SYCL compilation #2385
+- deduction guide for vector #2376
+- issue with device global variables with CUDA 12.4 #2303
+- clang9/nvcc11.2 boost bug #2294
+- HIP: fix CMake relocatable device code option #2290
+- Re-enable AtomicAtomicRef #2288
+- alpaka_add_library relocatable device code #2273
+- forwarding of msvc compiler flag '/Zo' #2266
+- Windows: usage of Idx to alpaka::Idx #2265
+- compiler detection for clang 17 and 18 as CUDA compiler with libstdc++ (gcc) #2256
+- support for non-integral types in Vec generator constructor #2236
+- memcpy warning #2295
+
+### Removed
+
+- support for nvcc11.0 and nvcc11.1 #2310
+
## [1.1.0] - 2024-01-18
### Added
diff --git a/alpaka/CMakeLists.txt b/alpaka/CMakeLists.txt
index 860796f5..fcaae972 100644
--- a/alpaka/CMakeLists.txt
+++ b/alpaka/CMakeLists.txt
@@ -35,6 +35,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
# Options and Variants
option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)
+option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF)
# Enable the test infrastructure only if alpaka is the top-level project
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
@@ -46,8 +47,9 @@ endif()
option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)
include(CMakeDependentOption)
+
cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF)
-cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF)
+cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARKS" OFF)
################################################################################
# Internal variables.
@@ -80,7 +82,7 @@ set(_alpaka_INCLUDE_DIRECTORY "${_alpaka_ROOT_DIR}/include")
set(_alpaka_SUFFIXED_INCLUDE_DIR "${_alpaka_INCLUDE_DIRECTORY}/alpaka")
# the sequential accelerator is required for the tests and examples
-if(alpaka_BUILD_EXAMPLES OR BUILD_TESTING)
+if(alpaka_BUILD_EXAMPLES OR alpaka_BUILD_BENCHMARKS OR BUILD_TESTING)
if (NOT (alpaka_ACC_GPU_CUDA_ONLY_MODE OR alpaka_ACC_GPU_HIP_ONLY_MODE))
if (NOT DEFINED alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE)
option(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE "enable alpaka serial accelerator" ON)
@@ -148,6 +150,10 @@ if(alpaka_BUILD_EXAMPLES)
add_subdirectory("example/")
endif()
+if(alpaka_BUILD_BENCHMARKS)
+ add_subdirectory("benchmarks/")
+endif()
+
# Only build the tests if alpaka is the top-level project and BUILD_TESTING is ON
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING)
add_subdirectory("test/")
@@ -184,10 +190,10 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
if(alpaka_INSTALL_TEST_HEADER)
install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
- DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
else()
- install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
- DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+ install(DIRECTORY "${_alpaka_SUFFIXED_INCLUDE_DIR}"
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
PATTERN "test" EXCLUDE)
endif()
diff --git a/alpaka/CONTRIBUTING.md b/alpaka/CONTRIBUTING.md
index bae65b41..eec1037c 100644
--- a/alpaka/CONTRIBUTING.md
+++ b/alpaka/CONTRIBUTING.md
@@ -1,11 +1,34 @@
# Contributing
+Please review our more detailed [Coding Guidelines](https://alpaka.readthedocs.io/en/latest/dev/style.html) as well.
+
+## Pre-commit
+
+This project is set up for use with [pre-commit](https://pre-commit.com). Using it will make your code conform with most
+of our (easily automatable) code style guidelines automatically.
+In short (for anything further see [pre-commit](https://pre-commit.com)), after running the following in your
+working clone of alpaka
+```bash
+# if not yet done, install the pre-commit executable following https://pre-commit.com
+cd /path/to/alpaka-working-clone
+pre-commit install
+```
+`git` will run a number of checks prior to every commit and push and will refuse to perform the
+pertinent action if they fail. Most of them (like e.g. the formatter) will have automatically altered your working tree
+with the necessary changes such that
+```bash
+git add -u
+```
+will make the next commit pass.
+
## Formatting
-Please format your code before before opening pull requests using clang-format 16 and the .clang-format file placed in the repository root.
+Please format your code before opening pull requests using clang-format 16 and the .clang-format file placed in
+the repository root. If you were using `pre-commit` during your changes, this has happened automatically already. If
+not, find further instructions below.
### Visual Studio and CLion
-Suport for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
+Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
### Bash
diff --git a/alpaka/README.md b/alpaka/README.md
index 945e6fba..f7a03994 100644
--- a/alpaka/README.md
+++ b/alpaka/README.md
@@ -15,18 +15,18 @@ The **alpaka** library is a header-only C++17 abstraction library for accelerato
Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.
-It is platform independent and supports the concurrent and cooperative use of multiple devices such as the hosts CPU as well as attached accelerators as for instance CUDA GPUs and Xeon Phis (currently native execution only).
-A multitude of accelerator back-end variants using CUDA, OpenMP (2.0/5.0), std::thread and also serial execution is provided and can be selected depending on the device.
+It is platform independent and supports the concurrent and cooperative use of multiple devices such as the host's CPU (x86, ARM, RISC-V and Power 8+) and GPU accelerators from different vendors (NVIDIA, AMD and Intel).
+A multitude of accelerator back-end variants using NVIDIA CUDA, AMD HIP, SYCL, OpenMP 2.0+, std::thread and also serial execution is provided and can be selected depending on the device.
Only one implementation of the user kernel is required by representing them as function objects with a special interface.
-There is no need to write special CUDA, OpenMP or custom threading code.
-Accelerator back-ends can be mixed within a device queue.
+There is no need to write special CUDA, HIP, OpenMP or custom threading code.
+Accelerator back-ends can be mixed and synchronized via compute device queues.
The decision which accelerator back-end executes which kernel can be made at runtime.
-The abstraction used is very similar to the CUDA grid-blocks-threads division strategy.
+The abstraction used is very similar to the CUDA grid-blocks-threads domain decomposition strategy.
Algorithms that should be parallelized have to be divided into a multi-dimensional grid consisting of small uniform work items.
These functions are called kernels and are executed in parallel threads.
The threads in the grid are organized in blocks.
-All threads in a block are executed in parallel and can interact via fast shared memory.
+All threads in a block are executed in parallel and can interact via fast shared memory and low level synchronization methods.
Blocks are executed independently and can not interact in any way.
The block execution order is unspecified and depends on the accelerator in use.
By using this abstraction the execution can be optimally adapted to the available hardware.
@@ -65,17 +65,17 @@ Supported Compilers
This library uses C++17 (or newer when available).
-| Accelerator Back-end | gcc 9.5
(Linux) | gcc 10.4 / 11.1
(Linux) | gcc 12.3
(Linux) | gcc 13.1
(Linux) | clang 9
(Linux) | clang 10 / 11
(Linux) | clang 12
(Linux) | clang 13
(Linux) | clang 14
(Linux) | clang 15
(Linux) | clang 16
(Linux) | clang 17
(Linux) | icpx 2023.1.0 / 2023.2.0 (Linux) | Xcode 13.2.1 / 14.2 / 14.3.1
(macOS) | Visual Studio 2022
(Windows) |
-|--------------------------------------------------------------------------------|-------------------------------------------------|-------------------------------------------------|---------------------------------------------|------------------------|------------------------------------------------------------|-------------------------------------------------------|-------------------------------------------------|---------------------------------------------|---------------------------------------------------|-------------------------------------------|-------------------------------------------|-------------------------------------------|----------------------------------|-------------------------------------------------------|--------------------------------------|
-| Serial | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
-| OpenMP 2.0+ blocks | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: |
-| OpenMP 2.0+ threads | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: |
-| std::thread | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
-| TBB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
-| CUDA (nvcc) | :white_check_mark:
(CUDA 11.0 - 12.3)[^2] | :white_check_mark:
(CUDA 11.4 - 12.0)[^2] | :white_check_mark:
(CUDA 12.0 - 12.3) | :x: | :white_check_mark:
(CUDA 11.0-11.2; 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.7 - 12.0) | :white_check_mark:
(CUDA 11.8 - 12.0) | :white_check_mark:
(CUDA 12.2) | :white_check_mark:
(CUDA 12.3) | :x: | :x: | :x: | :x: |
-| CUDA (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.0 - 11.5) | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.8)[^1] | :x: | - | - |
-| [HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (HIP 5.1 - 5.2) | :white_check_mark: (HIP 5.3 - 5.4) | :white_check_mark: (HIP 5.5 - 5.6) | :white_check_mark: (HIP 5.7 - 6.0) | :x: | - | - |
-| SYCL | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :white_check_mark:[^4] | :x: | :x: |
+| Accelerator Back-end | gcc 9.5 (Linux) | gcc 10.4 / 11.1 (Linux) | gcc 12.3 (Linux) | gcc 13.1 (Linux) | clang 9 (Linux) | clang 10/11 (Linux) | clang 12 (Linux) | clang 13 (Linux) | clang 14 (Linux) | clang 15 (Linux) | clang 16 (Linux) | clang 17 (Linux) | icpx 2024.2 (Linux) | Xcode 13.2.1 / 14.2 / 14.3.1 (macOS) | Visual Studio 2022 (Windows) |
+|----------------------|-------------------------------------------|-------------------------------------------|---------------------------------------|---------------------------------------|-------------------------------------------|-------------------------------------------------|-------------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|---------------------------------------|-------------------------|--------------------------------------|------------------------------|
+| Serial | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| OpenMP 2.0+ blocks | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: [^1] | :white_check_mark: | :white_check_mark: |
+| OpenMP 2.0+ threads | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: [^1] | :white_check_mark: | :white_check_mark: |
+| std::thread | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| TBB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| CUDA (nvcc) | :white_check_mark: (CUDA 11.2 - 12.5)[^2] | :white_check_mark: (CUDA 11.4 - 12.0)[^2] | :white_check_mark: (CUDA 12.0 - 12.5) | :white_check_mark: (CUDA 12.4 - 12.5) | :white_check_mark: (CUDA 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.6 - 12.0)[^2] | :white_check_mark: (CUDA 11.7 - 12.0) | :white_check_mark: (CUDA 11.8 - 12.0) | :white_check_mark: (CUDA 12.2) | :white_check_mark: (CUDA 12.3) | :white_check_mark: (CUDA 12.4 - 12.5) | :x: | - | :x: |
+| CUDA (clang) | - | - | - | - | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.5) | :white_check_mark: (CUDA 11.2 - 11.8) | :x: | - | - |
+| HIP (clang) | - | - | - | - | :x: | :x: | :x: | :x: | :white_check_mark: (HIP 5.1 - 5.2) | :white_check_mark: (HIP 5.3 - 5.4) | :white_check_mark: (HIP 5.5 - 5.6) | :white_check_mark: (HIP 5.7 - 6.1) | :x: | - | - |
+| SYCL | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :white_check_mark: [^4] | - | :x: |
Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported.
@@ -91,7 +91,7 @@ Dependencies
The **alpaka** library itself just requires header-only libraries.
However some of the accelerator back-end implementations require different boost libraries to be built.
-When an accelerator back-end using *CUDA* is enabled, version *11.0* (with nvcc as CUDA compiler) or version *9.2* (with clang as CUDA compiler) of the *CUDA SDK* is the minimum requirement.
+When an accelerator back-end using *CUDA* is enabled, version *11.2* of the *CUDA SDK* is the minimum requirement (with either nvcc or clang as CUDA compiler).
*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported.
*NOTE*: Separable compilation is disabled by default and can be enabled via the CMake flag `CMAKE_CUDA_SEPARABLE_COMPILATION`.
@@ -214,47 +214,7 @@ consider citing us accordingly in your derived work and publications:
Contributing
------------
-Rules for contributions can be found in [CONTRIBUTING.md](CONTRIBUTING.md)
-
-Authors
--------
-
-### Maintainers* and Core Developers
-
-- Benjamin Worpitz* (original author)
-- Dr. Sergei Bastrakov*
-- Kseniia Bastrakova
-- Dr. Andrea Bocci*
-- Dr. Antonio Di Pilato
-- Simeon Ehrig
-- Luca Ferragina
-- Bernhard Manfred Gruber*
-- Christian Kaever
-- Dr. Jeffrey Kelling
-- Dr. Stewart Martin-Haugh
-- Aurora Perego
-- Jan Stephan*
-- René Widera*
-- Dr. Jeffrey Young
-
-### Former Members, Contributions and Thanks
-
-- Dr. Michael Bussmann
-- Mat Colgrove
-- Valentin Gehrke
-- Dr. Axel Hübl
-- Maximilian Knespel
-- Jakob Krude
-- Alexander Matthes
-- Hauke Mewes
-- Phil Nash
-- Dr. Felice Pantaleo
-- Dr. David M. Rogers
-- Mutsuo Saito
-- Jonas Schenke
-- Daniel Vollmer
-- Dr. Jiří Vyskočil
-- Matthias Werner
-- Bert Wesarg
-- Malte Zacharias
-- Erik Zenker
+Rules for contributions can be found in [CONTRIBUTING.md](CONTRIBUTING.md).
+Any pull request will be reviewed by a [maintainer](https://github.com/orgs/alpaka-group/teams/alpaka-maintainers).
+
+Thanks to all [active and former contributors](.zenodo.json).
diff --git a/alpaka/README_SYCL.md b/alpaka/README_SYCL.md
index 5a4eadca..09d620c4 100644
--- a/alpaka/README_SYCL.md
+++ b/alpaka/README_SYCL.md
@@ -96,9 +96,8 @@ These can be used interchangeably (some restrictions apply - see below) with the
```
See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information.
* The FPGA back-end does not support atomics. alpaka will not check this.
-* Device global variables (corresponding to `__device__` and `__constant__` variables in CUDA) are not supported in the SYCL back-end yet.
* Shared memory works but on the GPU it is very slow.
-* The latest Intel OpenCL CPU runtime does not work properly. Some tests (`atomicTest`, `blockSharedTest`, `blockSharedSharingTest` and `warpTest`) fail with a `PI_ERROR_OUT_OF_RESOURCES`. The only runtime version that seems to work is 2022.14.8.0.04 (can be downloaded [here](https://github.com/intel/llvm/releases/download/2022-WW33/oclcpuexp-2022.14.8.0.04_rel.tar.gz) apart from a bug with `all_of_group` / `any_of_group` that requires the warp size being equal to the block size as a workaround.
+* The latest Intel OpenCL CPU runtime does not work properly. Some tests (`atomicTest`, `blockSharedTest`, `blockSharedSharingTest` and `warpTest`) fail with a `PI_ERROR_OUT_OF_RESOURCES`. The only runtime version that seems to work is 2022.14.8.0.04 (can be downloaded [here](https://github.com/intel/llvm/releases/download/2022-WW33/oclcpuexp-2022.14.8.0.04_rel.tar.gz)) apart from a bug with `all_of_group` / `any_of_group` that requires the warp size being equal to the block size as a workaround.
### Choosing the sub-group size (warp size)
diff --git a/alpaka/benchmarks/CMakeLists.txt b/alpaka/benchmarks/CMakeLists.txt
new file mode 100644
index 00000000..6a8da0ef
--- /dev/null
+++ b/alpaka/benchmarks/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan, Mehmet Yusufoglu
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+################################################################################
+
+cmake_minimum_required(VERSION 3.22)
+
+project("alpakaBenchmarks" LANGUAGES CXX)
+
+################################################################################
+# Add subdirectories.
+################################################################################
+
+if(NOT BUILD_TESTING)
+ # Testing is not enabled, therefore Catch2, which is part of common, must be pulled in explicitly.
+ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../test/common "${CMAKE_BINARY_DIR}/test/common")
+endif()
+
+add_subdirectory("babelstream/")
diff --git a/alpaka/benchmarks/babelstream/CMakeLists.txt b/alpaka/benchmarks/babelstream/CMakeLists.txt
new file mode 100644
index 00000000..5deb1a3e
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+# SPDX-License-Identifier: ISC
+#
+
+cmake_minimum_required(VERSION 3.22)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+project(babelstream LANGUAGES CXX)
+
+if(NOT TARGET alpaka::alpaka)
+ option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+ if(alpaka_USE_SOURCE_TREE)
+ # Don't build the benchmarks recursively
+ set(alpaka_BUILD_BENCHMARKS OFF)
+ add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+ else()
+ find_package(alpaka REQUIRED)
+ endif()
+endif()
+
+
+set(_TARGET_NAME "babelstream")
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+ ${_TARGET_NAME}
+ ${_FILES_SOURCE})
+
+target_include_directories(
+ ${_TARGET_NAME}
+ PRIVATE "src")
+
+target_link_libraries(
+ ${_TARGET_NAME}
+ PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER benchmarks/babelstream)
+
+# Register the benchmark as a CTest test.
+if(alpaka_CI)
+ # Only run for release builds since this is a benchmark
+ if(CMAKE_BUILD_TYPE STREQUAL "Release")
+ add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+ endif()
+else()
+ # For a normal benchmark test, number of samples should be equal to the default value.
+ add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+endif()
diff --git a/alpaka/example/babelstream/src/LICENSE b/alpaka/benchmarks/babelstream/src/LICENSE
similarity index 100%
rename from alpaka/example/babelstream/src/LICENSE
rename to alpaka/benchmarks/babelstream/src/LICENSE
diff --git a/alpaka/benchmarks/babelstream/src/README.md b/alpaka/benchmarks/babelstream/src/README.md
new file mode 100644
index 00000000..cd3eee70
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/src/README.md
@@ -0,0 +1,101 @@
+This work was initially based on the [cupla port of BabelStream](https://github.com/jyoung3131/BabelStream) from Jeff Young and has since been refactored.
+The BabelStream benchmark is developed by Tom Deakin and Simon McIntosh-Smith, University of Bristol HPC; it is based on John D. McCalpin's original STREAM benchmark for CPUs.
+Some implementations and the documents are accessible through https://github.com/UoB-HPC.
+
+# Example Run
+The benchmark can be run with custom arguments as well as Catch2 arguments.
+# With Custom arguments:
+./babelstream --array-size=1280000 --number-runs=10
+# With Catch2 arguments:
+./babelstream --success
+# With Custom and catch2 arguments together:
+./babelstream --success --array-size=1280000 --number-runs=10
+
+# Command for a benchmarking run
+# ./babelstream --array-size=33554432 --number-runs=100
+# Output is below:
+```
+Array size provided: 33554432
+Number of runs provided: 100
+Randomness seeded to: 2775986196
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 12.2133 0.0219789 0.0244341 0.0234795 268.435
+ CopyKernel 20.8898 0.01285 0.0141298 0.0130288 268.435
+ MultKernel 20.9943 0.0127861 0.0161767 0.0129707 268.435
+ AddKernel 24.4181 0.01649 0.0178725 0.0166714 402.653
+ TriadKernel 24.44 0.0164751 0.0182611 0.0166579 402.653
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:single
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 62.3725 0.00430374 0.00434411 0.00433501 268.435
+ CopyKernel 90.2948 0.00297288 0.00302862 0.00300712 268.435
+ MultKernel 90.3858 0.00296988 0.00302989 0.00300866 268.435
+ AddKernel 90.947 0.00442734 0.00448436 0.00446751 402.653
+ TriadKernel 90.88 0.0044306 0.00447952 0.00446739 402.653
+ DotKernel 93.369 0.002875 0.00291691 0.0029106 268.435
+
+
+
+AcceleratorType:AccCpuSerial<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:13th Gen Intel(R) Core(TM) i7-1360P
+WorkDivInit :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (33554432), blockThreadExtent: (1), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 12.2326 0.0438886 0.0543366 0.0463925 536.871
+ CopyKernel 20.8888 0.0257014 0.0272265 0.0260267 536.871
+ MultKernel 21.0395 0.0255173 0.0292734 0.0262349 536.871
+ AddKernel 24.6628 0.0326527 0.0383083 0.0334047 805.306
+ TriadKernel 24.5604 0.0327888 0.0494151 0.0335766 805.306
+
+
+
+AcceleratorType:AccGpuCudaRt<1,unsigned int>
+NumberOfRuns:100
+Precision:double
+DataSize(items):33554432
+DeviceName:NVIDIA RTX A500 Laptop GPU
+WorkDivInit :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivCopy :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivMult :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivAdd :{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivTriad:{gridBlockExtent: (32768), blockThreadExtent: (1024), threadElemExtent: (1)}
+WorkDivDot :{gridBlockExtent: (256), blockThreadExtent: (1024), threadElemExtent: (1)}
+Kernels Bandwidths(GB/s) MinTime(s) MaxTime(s) AvgTime(s) DataUsage(MB)
+ InitKernel 62.4307 0.00859947 0.00864104 0.00862767 536.871
+ CopyKernel 89.4157 0.00600421 0.00607738 0.00604754 536.871
+ MultKernel 89.2831 0.00601313 0.00606791 0.0060488 536.871
+ AddKernel 90.5499 0.00889351 0.00895834 0.00893668 805.306
+ TriadKernel 90.5685 0.00889168 0.00897055 0.00893744 805.306
+ DotKernel 93.2451 0.00575763 0.00581312 0.00579143 536.871
+```
diff --git a/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp b/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp
new file mode 100644
index 00000000..a22f7d03
--- /dev/null
+++ b/alpaka/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -0,0 +1,440 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include