From 5db4f44ed3705b19a248ef84105dbb33aa0f6688 Mon Sep 17 00:00:00 2001 From: Eric Phipps Date: Thu, 12 Sep 2024 15:42:25 -0600 Subject: [PATCH] Squashed 'tpls/kokkos/' changes from 1a3ea28f6..08ceff92b 08ceff92b Merge pull request #7202 from ndellingwood/master-release-4.4.00 948c13463 update master_history.txt for 4.4.00 6068673cb Merge branch 'release-candidate-4.4.00' for 4.4.00 f4ef4dab4 Merge pull request #7207 from dalg24/cherry_pick_automated_releases_master 76ffeea8d Add workflow to create releases with SLSA provenance generation f15e90c9c [ci skip] update changelog for 4.4.0 (#7188) 818b82712 Merge pull request #7190 from dalg24/rc440_desul_atomics_config c8bbbe2ef Fix atomic accessor for pre-volta GPU architectures (#7189) ca0efd501 Merge pull request #7185 from ndellingwood/cherry-pick-7181-to-rc4.4.0 c8c11c2f7 Fix bogus warnings for cuda/11.4 with gcc/8.5 (#7181) 608fcbea8 Set version number to 4.4.0 94e62755d Hide `IMPL_REF_COUNT_BRANCH_UNLIKELY` option (#7175) dbb0abb67 Merge pull request #7174 from rgayatri23/ompt_lock_code_delete e933bfd5b Implement KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND (#7168) c8c6ae9bd OpenMPTarget: Delete ununsed code. 3bf86311e Merge pull request #7173 from dalg24/prefer_exec_space_name_in_tutorial 3a4fb03be Prefer ExecutionSpace::name() to a typeid expression in hello world 39cfac80e Merge pull request #7170 from dalg24/gcc_deprecated_declarations_warnings af6fb10d5 Disable deprecated warnings with GCC < 11.1 for Pair 02ed662e8 Enable deprecation warnings in the GCC 8.4 build a6beb5a56 Merge pull request #7166 from tjhei/examples_c++11 b13cef88b tutorials: do not mention requiring c++11 73b6d032b Add nvidia Grace Architecture (#7158) 5a2292743 Fix Kokkos_CoreUnitTest_DeviceAndThreads (#7159) c22d638b9 Merge pull request #7163 from kokkos/dependabot/github_actions/ossf/scorecard-action-2.4.0 a65a6ce36 Bump ossf/scorecard-action from 2.3.3 to 2.4.0 eb11070f6 Make ExecutionSpace constructors explicit (#7156) 7d0e3e818 Fix Kokkos::Array default initialization for icpc (#7154) 5e2f147cd Make struct "ChunkSize" constructor explicit to avoid implicit construction in RangePolicy (#7151) 277c616e3 OpenMPTarget: Update docker clang build. (#7147) bc7e02a16 Hidden friend operator== for Kokkos::Array (#7148) d306b408b SYCL: Use sycl::shift_group_[left|right] and sycl::select_from_group (#7146) 49eccc859 Merge pull request #6828 from masterleinad/sycl_use_auto_range 28a7788b7 Use sycl::ext::oneapi::experimental::auto_range 2b8380992 Drop `Experimental::RawMemoryAllocationFailure` and don't catch exceptions to rethrow them in shared alloc (#7145) 3290a3de1 Fix using View without corresponding mdspan-type (#7140) 23b8813dc Add CMake options to control compilation flags for AMD GPUs (#7127) f53b26d44 Merge pull request #7143 from dalg24/ompt_bad_alloc f33443b49 fixup! Throw bad alloc if omp_target_alloc() returns nullptr 84a60e56c Fix Trilinos nightly failure due to `create_mirror*` refactor (#7126) a07cbe4eb Fix gcc-14 C++26 nightly jenkins build (#7137) dca818b1a Enable test_view_allocation_error with OpenMPTarget 5154f9d0b Throw bad alloc if omp_target_alloc() returns nullptr 571fbae26 Add `likely` and `unlikely` attribute from C++20 to ref counting in views (#6730) b02c83a4c Merge pull request #7141 from masterleinad/disable_failing_nan_tests_nvhpc 0a64dfc74 Get rid of `RawMemoryAllocationFailure::AllocationMechanism` and derived backend-specific exceptions (#7139) 7247c7f4f Check for LIBCXX 10 or later for C++20 and later (#7123) 1dd782dbf no_device_stack is unknown 4dc70d811 NVHPC: Disable failing NaN tests 8e682582f Merge pull request #6987 from masterleinad/remove_nvhpc_as_device_compiler_support 7bcd7acaf [ci skip] rename jenkins build 981163a67 Merge pull request #7138 from masterleinad/fix_sycl_sstream ee50eeb17 SYCL: Add support for Graphs (#6912) d7c0be575 SYCL: Add missing include for std::stringstream 62e60260d OpenMP: Ensure kernels submitted by multiple threads to the same instance don't run concurrently (#6151) c0edea91c Merge pull request #7133 from dalg24/simplify_finalize_logic 7ff2e10b3 Disable the PushFinalizeHookTerminate test on Windows 54cd8dac4 Merge pull request #7134 from masterleinad/fix_hip_nightly 59773deec [ci skip] Fix ROCm version to 6.1.2 in nightly CI db8326fb1 Merge pull request #7132 from dalg24/get_rid_of_exception_swallowers 1fbfc7248 Simplify the logic when finalizing and calling the registered functions e215b6bb3 Drop (unused) KOKKOS_ADD_ADVANCED_TEST TriBITS function cbed0764b Let the throwing push finalize hook calls terminate test actually run 44befe5a0 Do not swallow errors when deallocating memory with CUDA 63141b5b8 Merge pull request #7128 from masterleinad/c++20_minimum_compiler_versions 41eb0b6e1 Fix using and, or, xor in desul with MSVC (#7124) 1c04d0464 Merge pull request #7131 from dalg24/avoid_catch_mem_alloc_failure_and_rethrow 0e0307313 Do not bother catching memory allocation failure and rethrow af933fe32 Merge pull request #7129 from dalg24/drop_cuda_uvm_allocation_count 971c88440 Drop (unused) cuda uvm allocations counter cca439ff5 Define minimum compiler versions for C++20 support 73ac2492d Merge pull request #7081 from Rombur/gcc_14 2b248b0d2 Refactor: Move logic of `create_mirror*` to `Impl::create_mirror*` (#7061) a6e7f0d1f Merge pull request #7121 from kokkos/dependabot/github_actions/actions/upload-artifact-4.3.4 669b9f279 Deprecate `RawMemoryAllocationFailure::FailureMode::MaximumCudaUVMAllocationsExceeded` (#7120) 35b3f288f Merge pull request #7122 from dalg24/update_hip_nightly 72d9d077f Update HIP nightly build base image Ubuntu 20.04 -> 22.04 a18bcb059 Bump actions/upload-artifact from 4.3.3 to 4.3.4 93e372cbb Fix and test with -fsanitize=undefined in GitHub CI (#7104) 487b310c9 Merge pull request #7119 from ldh4/simd_fix_div_by_zero 56a40db0a Fix div by zero in math ops testing 33c9b8cef Merge pull request #7118 from crtrott/update-mdspan b84125ed6 Add AtomicAccessorRelaxed (#7089) dc175068b Update mdspan to 98a12b01b51b2 ba2075b3d Merge pull request #7117 from Rombur/hip_20 2a465fa8c Merge pull request #7102 from vicentebolea/fix-relative-dir-install 7b3e7c872 Merge pull request #7112 from masterleinad/fx_sycl_ci 4ddc65077 Merge pull request #7114 from crtrott/add-concepts-include-in-test e396c8f50 Update base image for ROCm 5.6 83f975a94 Github CI: Test with C++17, C++20, and C++23 (#7082) 4a54fb34b Add missing concepts include in test 14093185a SYCL CI: Manually build oneDPL fc45a032c move view allocation related functionality to a new header (#7110) b5f51b9c8 Workaround to ice with icpc when using -no-ip (#7106) f562ca246 Merge pull request #6802 from ldh4/simd_use_larger_vec_width 9f7a92f05 Clean up KOKKOS_LIB_INCLUDE_DIRECTORIES, append include directories to associated targets in Trilinos builds (#7103) d9f7dfe85 Merge pull request #7108 from masterleinad/restrict_jenkins_cuda 720490e7c cmake: fix relative to find kokkos_compiler_launcher f10076cb9 Restrict jenkins CI not to run on hopper for nvcc < 11.8 b650199ca clang formating 4d1278ec2 Added a comment about is_type structs 6e167f26a Workaround for the compilation failure for rocm 5.6-6.0 1eb1abe5e Disabling simd unit tests from building for Windows+CUDA build 61de582d1 clang-formatted e02c6a351 Added for width 4 for NEON aa833570a Added for AVX512 e320e0054 Added width 8 abi for avx2 2d7715239 Fix SpaceAwareAccessor based on usage experiment in View (#7088) 93db4f783 Merge pull request #7094 from aprokop/exec_spaces 28614907c Remove FIXME_NVHPC 23.7 guards 53b320221 Cleanup KokkosP hooks in `Profiling::` (#7096) 46df6c18f Merge pull request #7093 from seyonglee/disable_mdspanerror_openacc b69cf9eab Merge pull request #7099 from ndellingwood/fix-werror-icpc a10c912db Merge pull request #7097 from kokkos/dependabot/github_actions/Jimver/cuda-toolkit-0.2.16 a4e7eab6c Couple more icpc -Werror fixes 546bb2bd4 Merge pull request #7098 from DerNils-git/develop 1771bfd90 Copy print_configuration setting in combination of kokkos settings. c2a586338 Bump Jimver/cuda-toolkit from 0.2.15 to 0.2.16 70d50fe6f Merge pull request #7091 from JBludau/remove_overwrite_of_default_space e0d99fdd7 Merge pull request #7095 from ndellingwood/fix-more-icpc e826e7fc8 Fix more icpc issues 8fc95f871 Add missing space 304ad9d3e Temporarily disable failing parts in the TestMDSpan.hpp for the OpenACC backend. c7acfb75b remove cmake options to change default spaces 3c30f4023 Remove support for NVHPC as CUDA device compiler 24454fa82 Resolve various bogus icpc -Werror (#7079) 2c0bd1644 Merge pull request #7080 from masterleinad/threads_safety_serial a0b8deab0 Merge pull request #7078 from crtrott/update-desul 8501d5a90 Update desul version in github workflow ea4b96f8f Update internal desul file copies to 60c1115 04f6a4f5e Merge pull request #7083 from crtrott/add-missing-include cf14f1c71 Complex needs a tuple include 362c9d724 Merge pull request #7074 from crtrott/space-aware-accessor 9ac81a88d Don't delete special member functions explicitly 1e14d047c fix refcount exception safety (#6289) 3de267cb8 Improve performance for deleting an instance. 7b962ce29 Use correct includes for spaceawareaccessor 708abe21a Move `layout_iterate_type_selector` into Impl namespace (#7076) e2d5815bb Update from GCC 13 to 14 and use C++ 26 in Jenkins nightly 3309d9332 Fix thread-safety for the Serial backend 2678194c7 Structured binding support for Kokkos::complex (#7040) 3d27bf596 Merge pull request #7077 from crtrott/fix-dynamic-extent-definition 3418084ea OpenACC: Skip exec_space_thread_safety_range_scan (#7022) 549858227 Fix using shared libraries and -fvisibility=hidden (#7065) 6c78f4b1f SpaceAwareAccessor: fix issues (no-unique-address, is_empty) a1f1255ac Fix incompatible dynamic_extent definition in Kokkos 34db5182e Address review comments a6b95e9f5 Add specialization of SpaceAwareAcc for AnonymousSpace e2d68fd2b Use SpaceAwareAccessor in View mdspan-interop b2046a40e Add basic tests for SpaceAwareAccessor afbff6c53 Add SpaceAwareAccessor 0d5cc923a Enable MDSPAN support by default (#7069) 892e13c8c Merge pull request #7062 from masterleinad/use_find_cudatoolkit b967b1012 Merge pull request #7072 from ndellingwood/issue-7071 63d8093c1 Workaround icpc "missing return statement at end of non-void function" a3e2b84a7 KOKKOS_CUDA_ERROR->DEFAULT_MSG 64406064d Fix closing brackets 8229477b4 Move check CMake 3.20.1 with nvhpc 40cf84f91 Fix using CUDAToolkit for CMake 3.28.4 and higher d54619970 Merge pull request #7068 from masterleinad/fix_msvc_cuda 90d877036 Avoid lambda in sort_by_key_via_sort d50a87832 Workaround MSVC compiler issues in Views f96df0277 Merge pull request #7070 from masterleinad/fix_mdspan_test 363b464f5 Update to CUDA 12.4.1 in MSVC CI 1d7ccd8df Fix mdspan test 043f87304 Switch to using functors in sort_by_key_via_sort (#7059) 660136f5d Merge pull request #7021 from masterleinad/use_werror_for_cuda 00b4e7fe2 Merge pull request #7063 from masterleinad/restrict_to_array_subtest 0a5fac076 Merge pull request #7066 from Rombur/rocm_61 7c4f2b40a [ci skip] Use ROCM 6.1 in the nightly CI and disable one test 5d0983823 Restrict to_array subtest to NVCC >= 11.4.0 63f05204d Merge pull request #7058 from cedricchevalier19/bump-version-readme 013ef0cad Bump version in the readme f0a7c764a Merge pull request #7057 from kokkos/dependabot/github_actions/DoozyX/clang-format-lint-action-0.17 517f48a4b Merge pull request #7056 from kokkos/dependabot/github_actions/Jimver/cuda-toolkit-0.2.15 5269803eb Bump DoozyX/clang-format-lint-action from 0.16.2 to 0.17 e7ddeee49 Bump Jimver/cuda-toolkit from 0.2.14 to 0.2.15 8e0d4a923 Merge pull request #7055 from masterleinad/move_dependabot 4b913d3e7 Move dependabot to .github 2c3fd02aa Use -Xcudafe --diag_suppress=20208 in Makefile build 1625ec210 Try moving pragma suppress to tests 5b0d94518 Use -Xcudafe --diag_suppress=20208 for 11.6 build; nothing else seems to help 2a15c75c2 Suppress 'long double' is treated as 'double' in device code 0e88744dc Fix dangling reference 9d1842e71 Only use -Werror all-warnings with explicit nvcc_wrapper 5906cba05 Fix .jenkins whitespce 13447c433 Fix gtest d0d99bd58 Fix array size 1876867d6 Fix kokkos_swap 726a8f296 Fix quotation marks in CXX flags e89955018 Cuda: Fix nvcc warnings fad664c8f Merge pull request #7051 from tpadioleau/fix-unused-symbols-ctad-tests 69b0db4c1 Fix unused symbols in CTAD tests f53f905ec Merge pull request #7054 from masterleinad/update_scorecard 150f9009d Update scorecard GitHub workflow 1f602905c Add nightly CI on Frontier (#7048) 63a3cef18 Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow user-defined deduction guide in device code for clang compiler (#6954) 9f1cc4c97 Merge pull request #7046 from masterleinad/add_dependabot 669746ef8 Improve Kokkos Graphs (#7039) e011753fb Merge pull request #7047 from nliber/array-structured-binding-improvements f0704b39d Add tests to `ScopeGuard` (#7028) c8a5870c2 Merge pull request #7042 from masterleinad/fix_msvc_warnings 2a7ca1a37 Added static_asserts for out of range tuple_element and get (to match checks in complex structured bindings) 5071f2fd3 Add dependabot for GitHub Actions d65b67bbf (Rebase) Partial fix to compile time issues w/nvcc + Kokkos_ENABLE_DEBUG_BOUNDS_CHECK (#7013) 9bd74ee75 Avoid using "#if not defined" 561818bcd Merge pull request #7041 from ndellingwood/issue-7038 0f9efac16 TestArray: add intel guard to to_array implicit conversion test e06ddf6c1 Fix adjacent difference (#6922) 7472ed7ac Merge pull request #6812 from tcclevenger/unorderedmap_deepcopy 580dba58d Merge pull request #7034 from ndellingwood/issue-7031 1c60c8007 Merge pull request #7030 from nliber/ctad-teampolicy-v3 cf791bc2e Adding `Kokkos::to_array` (#6375) 7c67b020c Workaround icpc warnings 0410363d7 Refactor: Replace SFINAE by `if constexpr` for `create_mirror*` functions (#6955) a78d4ddb2 Copied the deduction guides and test cases over from branch nliber/ctad-teampolicy-crtp 24b24d075 Merge pull request #7006 from masterleinad/test_no_default_constructor_dualview 07a500982 Merge pull request #7024 from masterleinad/sycl_cuda_fix_graph_tests 6a3d918a4 Merge pull request #6834 from mhoemmen/fix-README-FENL-link a5bb0d41b Fix Kokkos README's FENL link c8e0a95cb HIP: Use builtin atomic for compare_exchange (#7000) cb27c9941 SYCL: Skip launch_six Graph test 6f176cde0 OpenMPTarget: Fix compiling Graph tests (#7020) 083fb014c Improve `Impl::is_zero_byte()` (#7017) 068d46882 Merge pull request #7018 from dalg24/disable_openmptarget_graph_test 42e83f165 Merge pull request #7023 from dalg24/remove_unused_cuda_api_wrappers f3bd253d3 Remove unused CudaInternal::cuda_{malloc,free}_async_wrapper 02433b625 Merge pull request #7019 from dalg24/nvhpc_suppress_deprecation_warnings bfe9aa2f1 Fixup for disabling deprecation warnings with NVC++ fa8b50102 Disable OpenMPTarget Kokkos::Graph test (does not compile) 468faaa37 Merge pull request #7015 from G-071/fix_hpx_execution_space_nvcc_compilation ce0915b5e Fix undefined behavior in is_zero_byte (#7014) f8f0cc473 Always run Graph tests (#7011) 6aa2ad7da Add a CITATION.cff file (#7008) 64fe75637 SYCL: Don't use shuffles for top-level reductions (#7009) 81b63c5c5 mdspan converting constructors (#6830) 226aecfb8 Properly guard deprecated `Kokkos_Vector.hpp` header self contained test (#7016) fc4383ab6 Fix unique_any_senders nvcc template deduction 2b7b98a1a Use parallel_for instead of parallel_reduce for check 835dbf594 Merge pull request #7012 from seyonglee/openacc_default_async_val_for_team da8be2257 This PR changes the default execution behavior of the parallel_for(team-policy) constructs in the OpenACC backend. - This PR handles a missing case not covered by the previous PR #6772 This PR also fixes the OpenACC backend error in the thread-safety test in PR #6938. df018d97f Suppress deprecated warnings via pragma push/pop in the tests (#6999) cadab6c1e Test DualView resize/realloc for types without default constructor 1d9d0df2e SYCL: Print submission command queue property (#7004) 506da184f Merge pull request #7002 from dalg24/rm_tpl_cusparse 00170ae80 Remove cuSPARSE TPL 5a5306c4e Merge pull request #6997 from masterleinad/sycl_fix_custom_parallel_for_range_deprecations a69e81a59 Merge pull request #6998 from rgayatri23/ompt_scan_lock 7cad3e7c3 OpenMPTarget: Use mutex lock for parallel scan. 37986fde4 [ci skip] update changelog for 4.3.1 (#6995) 6ecdf605e Merge pull request #6994 from ndellingwood/master-release-4.3.01 f5b34222c SYCL: Fix deprecation in custom parallel_for RangePolicy implementation 50a862cf6 SYCL: Prepare Parallel* for Graphs (#6988) d61d75ace Fix a bug when using realloc on views of non-default constructible element types (#6993) c80cdafef update master_history.txt 262d2d6e8 Merge branch 'release-candidate-4.3.01' for 4.3.01 e4cc6862c Merge pull request #6990 from masterleinad/fix_32bit_tpl_library_path 06e4c5bdc Merge pull request #6989 from dalg24/deprecated_attribute_comparison_operators_pair_t1_void ccadc7d9b Disable failing parallel_scan_with_reducers test 28260178f Avoid duplicated definition of KOKKOS_IMPL_32BIT 7b8e3a68f Fix TPL_LIBRARY_SUFFIXES for 32-bit build 9c7920291 Fix deprecation warnings with GCC for pair comparison operators 69567f305 Add thread-safety tests (#6938) c6d86474a Also use is_nothrow_swappable workaround for Intel Classic Compilers (#6983) 68fabc8a2 Merge pull request #6980 from ndellingwood/update-changelog-4301 fecc96c9e Merge pull request #6978 from ndellingwood/cherrypick-6951-4.3.01 4ee802725 Merge pull request #6979 from ndellingwood/cherrypick-6877-4.3.01 49e265601 Merge pull request #6977 from ndellingwood/cherrypick-6931-4.3.01 cd34c2e8b Merge pull request #6976 from ndellingwood/cherrypick-6578-4.3.01 a75dc70d8 Merge pull request #6982 from masterleinad/fix_fedora b7bb509d8 Merge pull request #6985 from crtrott/copyright-rc 85610f455 Merge pull request #6984 from crtrott/Copyright 83498bdc6 Fix Copyright file 45a140491 Fix Copyright file ccd0126b8 Fix fedora CI builds with flang-new 9fccb6107 Update changelog for 4.3.01 cf7f87c19 Merge pull request #6951 from masterleinad/fix_serial_space_team_policy fbab8bdf0 bring back --fmad option to nvcc_wrapper (#6931) 4d7258c26 MI300 support unified memory support (#6877) 30979fb93 cuda: reduction with `RangePolicy`: fix grid dimensions to work for large values and avoid overflow (#6578) 6486a9d68 Merge pull request #6975 from ndellingwood/update-version-4_3_01 dbd7f583a Merge pull request #6962 from dalg24/kokkos_array_const_qualified_element_type 775023262 changelog: header for version 4.3.01 73a7a41ba update to version 4.3.01 ed4d2544f Merge pull request #6972 from dalg24/fix_kokkos_compile_language_cuda_hip_w_omp 15d13f23b Merge pull request #6882 from Rombur/hip_atomic_fetch 27b3ced35 Merge pull request #6949 from Rombur/nightly_deprecated 2574b8029 Fix OpenMP+CUDA when `Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE` is `ON` f699a2c7a Fix enabling OpenMP with HIP and "compile as CMake language" 4f416f3b7 Merge pull request #6965 from dalg24/cmake_openmp_cxx 77ea52f97 Threads: Don't silently allow m_instance to be a nullptr (#6969) 4ec82963f OpenMPTarget: Update loop order in MDRange (#6925) 7e7709fdb SYCL: Avoid deprecated floating-point number abs overloads (#6959) 18642875f Merge pull request #6967 from crtrott/update-readme-kk-version 968639211 Add Linux Foundation notice and fix C++ standard 19ca9ce97 Update version d434f87e9 Do not require OpenMP support for languages other than CXX 2391f1765 Avoid introducing a 2nd definition of the Impl::swappable trait 031f6d94a Alternate definition of Impl::is_nothrow_swappable_v for NVCC version less than 11.4 ebb1cb308 Revert "Try to fix the CUDA 11.0 build" 63eef4623 Try to fix the CUDA 11.0 build 2e82fdd87 Merge pull request #6961 from dalg24/fixup_deprcated_guards_pair_void fafe861d0 Fix support for Kokkos::Array of const-qualified element type ab3cae486 Fix wrong macro guards for deprecated Kokkos::pair specialization cf59f3120 Merge pull request #6943 from dalg24/kokkos_swap_specialization_for_kokkos_arrays e2b7bb99e Merge pull request #6958 from masterleinad/sycl_replace_deprecated_usm_address_spaces a7827731c Kokkos::Impl::SYCLTypes:: -> Kokkos::Impl::sycl_ 5932685c9 Introduce alias based on feature macro 205fd156d Replace deprecated sycl::device_ptr/sycl::host_ptr cc602957c Merge pull request #6951 from masterleinad/fix_serial_space_team_policy 86f5988b3 Fix noexcept specification for kokkos_swap on zero-sized arrays 8706b68d5 kokkos_swap(Array) member friend should not be templated on some other type U 44fde213f Use Kokkos::AUTO for OpenMPTarget 34d0db2f4 Add test 04bc3d9e3 Merge pull request #6952 from nliber/changelog43 d5fd51274 Merge pull request #6947 from dalg24/deprecate_kokkos_pair_void_specialization 0859ab0af Fixed the link for P6601 (Threads backend change) e7b486ff6 Serial: Use the provided execution space instance in TeamPolicy 69c527a42 [ci skip] Enable deprecated code and deprecated warnings in nightly CI d914fe316 Fix deprecated warning from `Kokkos::Array` specialization (#6945) 906e8ce3c Merge pull request #6942 from dalg24/fix_nightlies_cxx20_requires_expression 730d8d828 Deprecate specialization of Kokkos::pair for a single element c9e21ce2a Add `kokkos_swap(Array)` sepcialization f94e8d34d Prefer standard C++ feature testing to guard the C++20 requires expression a8115e5df Adding converting constructor in Kokkos::RandomAccessIterator (#6929) 8c7cc95f9 Merge pull request #6940 from dalg24/unused_limits_header_include_in_kokkos_array f2d37801d Remove unnecessary header include de3a2632c Merge pull request #6934 from dalg24/deprecate_kokkos_array_proxy_template_param d88e2a5b0 bring back --fmad option to nvcc_wrapper (#6931) b5ec79bc9 Merge pull request #6936 from rgayatri23/issue_6874 92e02b50c CUDA: Update nvcc_wrapper a2af4e0d4 Deprecate trailing Proxy template argument in Kokkos::Array b0c2566c8 Merge pull request #6930 from Rombur/fix_nightly 0099c10be Fix nightly CI 6ea7be76e cuda: reduction with `RangePolicy`: fix grid dimensions to work for large values and avoid overflow (#6578) 164519d7d MI300 support unified memory support (#6877) 74c81228f Merge pull request #6926 from Rombur/latest_rocm 1fe8108fb Merge pull request #6906 from dalg24/make_view_of_arrays_less_special 3a27cdbc2 Add ROCm 6.0 in the nightly CI 7b41536c4 Merge pull request #6924 from masterleinad/fix_sycl_workgroup_scan 8cf841076 SYCL: Fix range in subgroup scan for workgroup_scan 55c575750 Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors (#6891) e52cda370 Merge pull request #6785 from Rombur/memory_test e93b168ba Merge pull request #6907 from dalg24/rm_experimental_layout_tiled 98b1a38e5 SYCL: Improve team_reduce implementation (#6562) 1256f6919 Merge pull request #6822 from CExA-project/fix-deep-copy 486cc745c Merge pull request #6908 from ndellingwood/master-release-4.3.00 4b9093099 Refactor: Uniformize `create_mirror*` parameter name for views (#6917) 077ea33c4 Remove trailing whitespace in changelog cc21a5482 Merge pull request #6919 from ndellingwood/dev-changelog-4300 497b438f1 CHANGELOG.md: 4.3.00 update a833fb00b Preparing readme for develop as the default branch (#6796) caa139c9b SYCL: Unroll shuffle loops for top-level parallel_reduce and parallel_scan (#6750) 47a50ac3c Update master_history.txt for 4.3.0 f08217a49 Accommodate users that depend on a code that define silly macros (#6909) 5cf09513c Merge pull request #6910 from tpadioleau/remove-return-functor-copy-for_each 2aecb1d24 SYCL: Fix multi-GPU support and add test (#6887) 059cd15c0 Accommodate users that depend on a code that define silly macros (#6909) e33da600f Fix merge artifact b6678539a Drop specialization of ViewMapping for Kokkos::Array 391e0408b Do not return a copy of the input functor for Kokkos::Experimental::for_each 635551058 Move `Kokkos::Array` tests to a more suitable place (#6905) 5f9214049 Merge branch 'release-candidate-4.3.00' for 4.3.0 06850bf74 [ci skip] Update changelog (#6886) 5eac0bc6f Merge pull request #6876 from masterleinad/disable_fedora_rawhide 1efeb5d76 Deprecate is_layouttiled trait 51b98e1d7 Get rid of now unnecessary use of is_layouttiled trait e2cfdec54 Drop Experimental::LayoutTiled class template 68c668469 Update Intel GPU architectures in Makefile (#6895) a53d30aab Merge pull request #6896 from masterleinad/fix_makefile_threads 7ddc2d39c [4.3.00] Cuda: Fix configuring with CMake 3.28.4 (#6903) 8d734b026 Cuda: Fix configuring with CMake 3.28.4 (#6898) 772e745a1 Merge pull request #6899 from ndellingwood/cherrypick-6892 0834a1281 Fix a bug in Makefile when using AMD GPU architectures (#6892) 2035e313d Fix a bug in Makefile when using AMD GPU architectures (#6892) 872dc422f Fix Makefile.kokkos for Threads 46354d25d Use builtin for atomic_fetch in the HIP backend ae4d0013d TestViewCopy_c.hpp: better handling for OpenMPTarget a2f2ba404 TestViewCopy_c.hpp: add new unit test for deep copy (ViewFill) 841b3a9f9 Fix deep copy when filling Rank-7 views 9fff1e066 Merge pull request #6881 from dalg24/bump_develop_to_4_3_99 05bd48516 [ci skip] Bump version number to 4.3.99 1c60a32b7 Set version number to 4.3.0 a34d910ac Merge pull request #6879 from ndellingwood/update-rocthrust-check-trilinos 096e72437 Scratch space fix for MultiGPU (#6866) 49bd895ae kokkos_tpls.cmake: update default option to enable rocthrust c1a800650 Don't use Fedora development version in GitHub CI 5931cbd29 Merge pull request #6871 from masterleinad/fix_link_rocthrust 5e7cab99b SYCL: Make sure to call find_dependency for oneDPL if necessary (#6870) 8062a6020 Fix linking with rothrust in downstream applications a2b64e0e8 Improve message on view out of bounds access and always abort (#6861) da77d6e14 Merge pull request #6868 from Rombur/hip_sort_by_key 128caa1df Merge pull request #6869 from masterleinad/mdrange_ctad_test_warning 3a765351c Fix unused variable warning in TestMDRangePolicyCTAD.cpp e5126e929 Add HIP specialization for sort-by-key 35ad698e0 Add support for rocThrust in sort when using HIP (#6793) 4e835e136 Merge pull request #6816 from crtrott/add-security-md 5ffcc1dcc Merge pull request #6840 from CExA-project/cmake-bench cfc260ac0 CTAD (deduction guides) for MDRangePolicy (#5516) 6db04b3b5 CTAD (deduction guides) for RangePolicy (#6850) c7ad79c4f Merge pull request #6862 from nmm0/update-mdspan-tpl 82b0f2a60 Merge pull request #6860 from masterleinad/fix_cstyle_cast_clang_tidy 121964a93 update mdspan tpl 9a7e7958a Split some classes from Kokkos_ViewMapping (#6859) c3c8a70d2 Update the unsafe implicit conversion error message in MDRangePolicy (#6855) 9feb104d9 Fix fallback implementation for sort_by_key (#6856) 99c7e1b1c Fix amdclang++ compilation (#6857) 97a94b60a Fix C-style cast 3d485c19d bytes_and_flops: fix a counter name 52c41e6b3 Merge pull request #6854 from dalg24/bump_google_benchmark 4dcbff2cf Benchmarks: disable 2 benchmarks for OpenMPTarget 715d6156e policy_benchmark: fix indentation 97fa76f29 fix some warnings in policy_performance benchmark 750ef211a add policy_performance benchmark to CMake 16d2edbb3 add atomic benchmark to CMake 932466f21 add gather benchmark to CMake 5c9a4aa3c bytes_and_flops fix a small bug in command line argument 277339090 bytes_and_flops with CMake e83619830 Merge pull request #6858 from masterleinad/fix_unused_variable_ctad dc524910d Avoid unused variable warning in TestRangePolicyCTAD.cpp 8b8de2cf4 Remove variadic range policy constructor (#6845) 0cdc9eb76 Bump Google Benchmark version v1.{6.2 -> 7.1} in CMake FetchContent 04a5334c6 Remove redundant RangePolicy constructor (#6841) 058c3a08e Fix scorecard workflow (#6831) c90a9c6f7 Implement sort_by_key (#6801) 549c50b9c Merge pull request #6800 from masterleinad/sycl_clean_device_selection 16a5ebe95 multi-GPU support: Add test for all policies (#6782) bb734012e Merge pull request #6837 from masterleinad/fix_unwanted_fence_parallel_scan_no_fence_test 99510b131 Merge pull request #6825 from masterleinad/cleanup_kokkos_configure_core 24f251a85 Add test for current CTAD support with RangePolicy (#6803) e2c810e1f Avoid detecting unwanted fences in the parallel_scan_no_fence test e2689abc0 Merge pull request #6829 from ndellingwood/update-changelog-421 9d33cb772 Clean up shift_{right, left}_team_impl (#6821) 361bdbf49 [4.2.01]: changelog update (#6656) 74b421b19 Merge pull request #6826 from masterleinad/update_github_actions e67ce088d Merge pull request #6824 from masterleinad/fix_sycl_ci 1112e07eb Update GitHub actions ot use Node 20 c3f0a2698 Cleanup KOKKOS_CONFIGURE_CORE df68761f9 SYCL CI: Avoid setvars.sh 696654a1c Only call deep_copy_view() from deep_copy(), add deprecation warning 0f3b727b1 Merge pull request #6813 from fnrizzi/fix_constness_for_views_std_algos 48588d08b Add CodeQL GitHub Action (#6818) fe6a937af Merge pull request #6815 from masterleinad/fix_sort_fence a46a5a14e Merge pull request #6806 from dalg24/rm_older_cpu_archs a1199b3df Explicity pass template params to ZeroMemset for intel icpc compilers (#6807) cdb634dba Merge pull request #6817 from dalg24/license_in_readme bd9db1562 [ci skip] Update license badge and links in the README 2a8ac6f48 Adding SECURITY.md file c95f9542f Fix fence in Kokkos::sort when using std::sort 8e8d45724 Remove unused typedef 513d8db05 fix constness for views 59e2fd08d Redeine deep_copy for UnorderMap f40d55528 Merge pull request #6810 from crtrott/update-workflow-permissions 54c2336c5 Update workflow permissions 8963927d0 Merge pull request #6808 from crtrott/ossf-scorecard 4b84ae0e6 Add OpenSSF scorecard workflow e0dc0128e Merge pull request #6770 from ndellingwood/master-release-4.2.01 3611cfef3 SYCL: Improve print_configuration (#6795) 37962b3d2 SYCL: Cleanup device selection 17d074259 Drop Intel Westmere and SSE4.2 extension 5b86415d6 Drop IBM Blue Gene/Q and POWER7 architectures 65dca527a Merge pull request #6798 from dalg24/rm_librt 54d41bdb8 Merge pull request #6797 from dalg24/intel_mm_alloc 3b515c99e Cuda multi-GPU support: Pass the correct device id to get_cuda_kernel_func_attributes (#6767) aced864ec Drop librt TPL and associated KOKKOS_ENABLE_LIBRT macro 21b110542 Drop KOKKOS_ENABLE_INTEL_MM_ALLOC macro 7ff87a5b2 SYCL: Filter GPU devices (#6758) 8d58aadf0 Merge pull request #6790 from dalg24/impl_get_gpu_returns_optional 3e405209d Add support for RISCV and the Milk-V's Pioneer (#6773) 49f646283 Merge pull request #6786 from masterleinad/fix_cuda_occupancy 76f740fdb Merge pull request #6791 from dalg24/rm_hbw_space 3496c6fde Remove stray include header 393509470 Merge pull request #6792 from ldh4/simd_add_missing_vector_aligned_in_neon 1502379d0 Added missing copy_from() in neon for vector_aligned 95f70b3f4 Remove support for memkind 473cd5313 Remove DummyPolicy 136360bb3 Restore TestCommonPolicyConstructors.hpp e28b57976 Fixup bogous shared alloc fence labels mentioning HBWSpace f07a537c4 Drop Experimental::HBWSpace 0ed2ebfee Make ranges non-trivial 391f2d12f Fix SharedAllocationRecord to allocate using the correct execution space instance (#6789) 3db377e15 Fixup select from visible devices 1327c3779 Let Impl::get_gpu return std::optional and delegate device selection when appropriate 26060fed7 Don't try to compile the test for any backend with MSVC+Cuda 19dcd64da test_execution_policy_occupancy_and_hint might be unused 99b2e46ec Run OccupancyControlTrait on all execution spaces 91cc45e3a Split runtime checks from TestCommonPolicyConstructors into OccupancyControlTrait 442e4d42a Add checks for unsafe implicit conversions in RangePolicy (#6754) 4d29e39ab Disable test for MSVC+Cuda 31fb4761d simd: support vector_aligned_tag (#6243) 5f128d27d Merge pull request #6787 from ldh4/simd_skip_reduction_omptarget 01d5f8149 SYCL: Error out on initialization if the backend is different from ext_oneapi_* (#6784) 97997807d Temporarily disable simd_reduction test for omptarget build 20d52fb1c Fix Occupancy for Cuda b4bc40614 Reenable TestHIP_Memory_Requirements 63a1208b3 Fixup use provided execution space when copying host inaccessible reduction result (#6777) 7d2ea7212 Cuda multi-GPU support: Make some variables device-specific, update Kokkos::fence (#6753) 2d273c86a Merge pull request #6778 from Rombur/fix_in_parallel 379007a35 Merge pull request #6768 from dalg24/fix_device_id_test_omp_target 917baa6d6 Fix typo in deprecatation macro used in HIP cc6ecf058 Merge pull request #6772 from seyonglee/openacc_default_async_val 4c94f089b Get rid of `ZeroMemset`'s silly trailing value argument (#6769) af806fb5d Drop 2-arguments `ZeroMemset` constructor overloads (#6764) 729940c87 Attempt to fix device id test with OpenMPTarget 69fc8f851 Merge pull request #6763 from masterleinad/cuda_dont_use_singleton_wrapper_tasks b4c61a8f2 Merge pull request #6766 from dalg24/std_algo_tests_drop_print_statements 7d5fff958 Get rid of print statements in parallel algorithms unit tests 408e8be5b OpenMPTarget on Intel GPUs update (#6735) 61a07cf2a Merge pull request #6762 from masterleinad/cuda_dont_use_singleton_wrapper_space_instance bbb895a34 Remove redundant calls in rangepolicy constructors (#6765) 71b246d67 Deprecate `in_parallel` (#6032) (#6582) 7439ec9d4 Avoid calling wrapper functions with singleton in Kokkos_Cuda_Task.cpp eecd917f6 Change the default execution policy behavior of the OpenACC backend from synchronous to asynchronous executions. - Change the default OpenACC async_arg value from acc_async_sync to acc_async_noval. - Add acc_wait(async_arg) to scalar reduction operations (parallel_reduce()). 92307a5ec Update master_history.txt for 4.2.01 221e5f7a2 Merge branch 'release-candidate-4.2.01' for 4.2.01 26ad2643c Merge pull request #6761 from dalg24/cuda_get_last_error 7b5fbd414 [4.2.01]: changelog update (#6656) a082f820d Avoid calling wrapper functions with singleton in some classes b3d8643e8 Drop CudaInternal::cuda_get_last_error_wrapper() 2ca8e73a6 Merge pull request #6751 from ndellingwood/cherrypick-6746-to-rc4201 11b58159e Merge pull request #6756 from ndellingwood/cherrypick-6742-rc-4201 d2913cb38 Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` (#6713) e2f452882 Merge pull request #6742 from masterleinad/cleanup_trilinos_cmake_cxx_flags 4621c8643 Merge pull request #6742 from masterleinad/cleanup_trilinos_cmake_cxx_flags 20150550e Merge pull request #6747 from uliegecsm/fix-remove-if 540368114 std(remove-if): fixing tmp view alloc + avoid evaluating twice the predicate during final pass d4a099599 Merge pull request #6749 from ndellingwood/cherrypick-6510-to-rc4201 6229367eb Merge pull request #6746 from tcclevenger/cuda_warp_sync_to_avoid_race_conditions d8ace9763 Merge pull request #6746 from tcclevenger/cuda_warp_sync_to_avoid_race_conditions 8845cee38 Merge pull request #6510 from ndellingwood/fix-werror-pedantic 650ac4067 Avoid unnecessary zero-memset of the scratch flags in SYCL (#6739) d560c4719 Drop support for deprecated command-line arguments and environment variables (#6744) 57126af31 add more warp sync for cuda reductions e1415f8fc Merge pull request #6630 from tcclevenger/potential_racecondition_in_cuda_reduce dcf93fc08 Merge pull request #6738 from dalg24/shared_allocation_record 2dc7cbcc9 Cuda multi-GPU support: Allow execution space instance constructor to run (#6706) a1a6ea14c Fix TestThreadVectorMDRangeParallelReduce (#6734) c17969f33 Trilinos: Don't let Kokkos set CMAKE_CXX_FLAGS d18ad8f34 Untangle SharedAllocationRecord spaghetti code 34973c773 Merge pull request #6731 from Rombur/hip_ci_new abd50dc36 Merge pull request #6733 from simon-schlepphorst/fix_cmake_for_cxx26 5610068c5 Don't touch my records! (refactor Cuda/HIP/SYCL/Threads to not directly mess with `SharedAllocationRecord`) (#6732) 407e18dc8 Use team_size_max to fix "Team size too large" error in reducer test (#6725) 523d70189 Disabling failing HIP test in the CI c4e1b86c8 Reenable HIP testing 87f32846b Add KOKKOS_ENABLE_CXX26 to the configuration metadata 39a0f3d67 Add support for C++26 in generated makefiles bd3c0a552 Add C++26 standard to CMake Setup 6912b3998 Guard `[MD]RangePolicy` precondition check for deprecated code 4 (#6726) 8a914909d Merge pull request #6729 from dalg24/acc_allocation_error 5781d176e Disable openacc.view_allocation_error test 000fccc50 Merge pull request #6728 from Rombur/hip_ci_tmp_fix 3d33665ff Fixup using declaration f9f3c6e13 [OpenACC] throw if acc_malloc returned nullptr a3aa567af Add RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc enumerator 8f743cf95 Ensure view_allocation_error does not silently ignore that no exception was thrown 9eca17795 Fix Docker env variables 86f5bb7d8 Let the smart pointer manage the CUDA/HIP stream (#6721) f42a8cb03 Temporary fix to reenable HIP CI 179d2e67f Add bound checks in RangePolicy and MDRangePolicy (#6617) ea564a274 Merge pull request #6723 from ndellingwood/cherrypick-6671-rc-4.2.01 4d784fe01 CHANGELOG.md: remove stray trailing whitespaces f53b18b6c Merge pull request #6671 from rbberger/add_mi300_gfx940 95934133f Merge pull request #6722 from ndellingwood/fix-hip-missing-header 256c0ca62 Kokkos_HIP.cpp: include Kokkos_Core.hpp to resolve errors 35a867d37 Make initialize and finalize of the Cuda/HIP singleton less special (#6714) bed3064ef Merge pull request #6712 from dalg24/cuda_error_cleanup 1e10099a1 Merge pull request #6715 from dalg24/hip_extraneous_closing_brace 4e33b3bf9 HIP: Forgot to delete matching brace closing the namespace e6ff1a469 No need to jump through so many hoops to print the error message 868e42e7b Get rid of CudaInternal::cuda_get_error_{name,string}_wrapper c75d730d2 Deprecate `{Cuda,HIP}::detect_device_count()` and `Cuda::[detect_]device_arch()` (#6710) 474366af4 [4.2.01] Fix msvc cuda release (#6660) 9393b358f Don't use the compiler launcher script if the compile language is CUDA. (#6704) fa91c962a Merge pull request #6711 from dalg24/pointless_cudaexec_forward_declaration 0254c631b Drop pointless Kokkos::Impl::CudaExec forward declaration be0c796c4 Merge pull request #6658 from masterleinad/fill_random_sync cad863fca Merge pull request #6708 from fnrizzi/inplace_transform_inclusive_scan 36da6cca7 add tests for in-place `inclusive_scan` (#6682) ee5cbfc25 Fix TeamThreadMDRange parallel_reduce (#6511) 89ba3fbae Provide new public headers `` and `` (#6687) 0ba8c40fc Provide `kokkos_swap` as part of Core and deprecate `Experimental::swap` in Algorithms (#6697) 673401038 add tests 96d530a24 Remove Kokkos::[b]half_t volatile overloads (#6579) 0e4a158a7 Check matching static extents in View constructor (#5190) 27286c32d Add `ATOMICS_BYPASS` configuration option to disable atomics (#6692) 23b02f064 Merge pull request #6701 from masterleinad/fix_enable_compile_as_cmake_language c9038983a Merge pull request #6681 from masterleinad/disable_bessel_sycl 5bb3ba32a Merge pull request #6695 from dalg24/cleanup_profiling_section 3523bc3e7 Enable `{transform_}exclusive_scan` in place (#6667) bec13acd0 Merge pull request #6703 from ndellingwood/issue-6702 68de5ce19 Merge pull request #6700 from dalg24/fixup_print_tolerance 716bef2a4 test_array_ctad: disable test for intel versions < 2021 3358970c2 Try linking against CUDA libararies even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE cbf1c644c Fixup cast tolerance to double before printing 9f5e38e97 SYCL: Address deprecations after oneAPI 2023.2.0 (#6577) 06de563f9 Add CI for MSVC+Cuda (#6661) efc0c365c Kokkos::Array deduction guide (#6373) 654a51f60 GitHub CI: Test with AddressSanitizer (#6676) 4eae6a99f Cosmetic changes to ProfilingSection f485cfa53 Let `Profiling::ProfilingSection(std::string)` constructor be explicit and nodiscard (#6690) 4078a0d8a Cuda: Allocate using the correct device (#6392) 02b46c09c #5333: CUDA: Use scratch space appropriate to small reduction elements in Team reductions (#5334) f02539e35 Merge pull request #6647 from dalg24/ulp_should_be_integral 79a36295d Merge pull request #6649 from dalg24/we_dont_need_no_dual_view_converting_assignment_operator 2ac06ce63 Merge pull request #6689 from dalg24/profiling_section 73c750755 Drop unnecessary header include in Kokkos_Profiling_ProfileSection.hpp 5aa0ceee4 Drop unnecessary guarding for a tool library being loaded in ProfilingSection 391daefd5 fill_random without exceution space instance should fence 8de16ea35 Disable more Bessel tests for SYCL on INtel GPUs cbbe09b93 OpenMP: Use `omp_get_nested` for older gcc versions (#6685) fe06b6f36 Merge pull request #6652 from masterleinad/ompt_printf 7e73c2b47 Merge pull request #6675 from brian-kelley/DeepCopyMsg f38553cb0 Merge pull request #6361 from masterleinad/cuda_multiple_devices_constructor 79164a43a Improve handling of printf in OMPT on Intel GPUs 52e44d6cf SYCL: Force inlining of Kokkos::printf (#6650) 2092c01a4 Merge pull request #6651 from masterleinad/disable_hip_ci f71052b5f Merge pull request #6680 from Rombur/rocm_60 5df22b87b Workaround for ROCm 6.0 failing to compile with AVX2 SIMD support 64a7774b6 Merge pull request #6671 from rbberger/add_mi300_gfx940 154a57df8 src->source, dst->destination 72bc7ed42 Add missing include sstream 838f8938e Add a unit test for new deep_copy exception msg 316ceac58 Improve "no copy mechanism" exception message 18d7d78f5 Merge pull request #6664 from dalg24/openacc_not_always_true e4a7cfc78 Per review prefer always_false::value to is_void_v 33db3046a Add Impl::always_false type-dendent false trait 293319c58 Add missing gfx940 62855dcf1 Merge pull request #6662 from cwpearson/feature/cmake-stream cedbf56f6 Merge pull request #6665 from dalg24/not_accomodating_external_definition_of_kokkos_assert 1bd9ce7a5 Merge pull request #6659 from crtrott/fix-msvc-cuda-develop fb668b143 Merge pull request #6666 from masterleinad/openmp_use_omp_get_max_active_levels a996c12a0 Use omp_get_max_active_levels() when supported ae71e4002 Drop guards to accommodate external code defining KOKKOS_ASSERT 76ea3a3a9 Do not negate the dependent true traits helper 379d5db1a Add CMakeLists.txt for stream benchmark ed08974c7 Unit test for issue 3371 (negative vector length should not yield a negative max_team_size) (#6076) e524ec777 Move header for Damien because he is right c6d01e943 Fix formatting 249f8b4fb Sidestep lacking CTAD support msvc/cuda 7dcf1deba Avoid lambdas in constexpr branch for msvc/cuda 458910fbf Fix missing include on msvc/cuda fb0380b91 Fix builtin_unreachable use for MSVC/CUDA 843fca336 OpenMPTarget: clang extensions for dynamic shared memory. (#6380) 1abcca9d4 Merge pull request #6626 from masterleinad/cherry_pick_6608_4_2_01 d0548d658 Merge pull request #6631 from tcclevenger/cherry-pick-6630 4e4a047a2 Merge pull request #6627 from masterleinad/cherry_pick_6623_4_2_01 6e1865714 Merge pull request #6638 from dalg24/rc421_early_tools_profiling 84299466a Merge pull request #6655 from kokkos/cherry_pick_6653 232114fcd Merge pull request #6653 from masterleinad/remove_deprecation_allocation_mechanism_gcc_11_0 24b64848e Merge pull request #6653 from masterleinad/remove_deprecation_allocation_mechanism_gcc_11_0 8e16df3cf Merge pull request #6557 from dalg24/rm_logical_spaces eadc210bf Remove deprecation warning for AllocationMechanism for gcc <11.0 dcdfcac91 Diable HIP CI 9fd95ebcb Don't use rocm-docker for clang-format a35bc6890 Merge pull request #6643 from seyonglee/fix_openacc_toomuchwarning b9b63dfd8 Drop DualView converting copy assignment operator 71729af71 Fixup test math functions ulp should double -> int b877a6e9b Merge pull request #6645 from fnrizzi/fix_6644 a41dba586 SYCL: Restrict workaround for is_device_copyable to oneAPI versions before 2024.0.0 (#6532) 07cdd7000 add missing header fix #6644 c3dde624d Merge pull request #6642 from uliegecsm/kokkos-tools-typo 685620918 This PR fixes the too-much-OpenACC-warning issue, mentioned in PR #6639. This PR also re-enables the OpenACC CI test. 9041bdaf3 Merge pull request #6625 from Rombur/jenkins_multibranch 4a6a92056 Merge pull request #6634 from Rombur/ubuntu_18 ed64cea7f tools(profiling): type (related to kokkos/kokkos-tools/pull/221) 30f020777 Merge pull request #6640 from uliegecsm/unorderedmap-types 52d5c3738 nvcc wrapper: remove troubling flag to fix 6628 (#6629) 12af5769b Merge pull request #6639 from Rombur/disable_openacc e9899a5b1 unorderedmap: modernize traits 7739ca191 Disabling OpenACC in the CI because it emits too many warnings e4753753b Merge pull request #6635 from uliegecsm/kokkos-profiling-fix f8788ef2a Merge pull request #6635 from uliegecsm/kokkos-profiling-fix b00c1e068 update comment to include final() mention 54c62d15d Replace ubuntu:18.04 with ubuntu:20.04 as base image for clang-format c9d7bbad1 kokkos(profiling): do not finalize in any backend 6dcd72b9b Add warp sync for Cuda parallel reduce 4d4a343e5 Add warp sync for Cuda parallel reduce c9540f51c Merge pull request #6624 from uliegecsm/kokkos-graph-hip-fix 0c617db8a Merge pull request #6608 from masterleinad/fix_numeric_traits_bfloat16 16972af28 Add jenkins multibranch pipeline options 0d3428087 Merge pull request #6614 from masterleinad/gh_workflows_icpc_fedora_to_intel a7bf142d5 Merge pull request #6604 from fnrizzi/fix_test_dev_and_th 81580ca15 Merge pull request #6615 from uliegecsm/nvcc-wrapper-missing b54105701 Merge pull request #6624 from uliegecsm/kokkos-graph-hip-fix f31436a09 graph(HIP): adding inline keyword to fix #6623 f1d466622 Merge pull request #6608 from masterleinad/fix_numeric_traits_bfloat16 a4720ce41 Add clang-format check to GitHub workflows (#6612) 0262f7405 nvcc(wrapper): adding missing `--generate-line-info` arg 71a9bcae5 Merge pull request #6613 from ndellingwood/master-release-4.2.00 ae75d3895 GitHub Workflows: Use Ubuntu 22.04 instead of Fedora for Intel compiler testing 0a40d16b8 Merge pull request #6611 from cz4rs/fix-formatting 3dd0b8253 [ci skip] fix formatting 374064ab7 add branching 33a1106da use reference 61842b7d1 remove comments 68e4bedc4 fix for macos 17af2f3c4 try 2779b29b5 avoid pyt package f0af4672c try fix aa2ff89fb Merge pull request #6598 from uliegecsm/kokkos-unique-fix 81a958653 Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF (#6593) ff7104cee [ci skip] Update changelog on develop for 4.2.00 (#6592) 932c1fb2f Added missing operator* to NEON simd 8fd8c94aa Threads: add missing broadcast to TeamThreadRange parallel_scan (#6601) 1a145311f Fix generated Makefile when using gnu_generate_makefile.sh and make >= 4.3 ee655c08a Fix TestNumericTriats.hpp for SYCL with bfloat16 support c60716df4 try fix 38cbde408 Update master_history.txt for 4.2.00 abe01c88f Merge pull request #6600 from masterleinad/cherry-pick-6590 08efbb919 Merge pull request #6595 from masterleinad/cherry-pick-6543 9c37437ea Use binary wrapper for consistency in definition of half types numeric traits (#6590) 61b93ec7f kokkos(unique): fix allocation of temporary view to enfore using the provided space instance 2f5723bd6 Merge pull request #6585 from masterleinad/ompt_guard_scratch_allocations d5a480291 Fix infinity, quiet_NaN, signaling_Nan, isfinite, isnan, isinf for half_t and bhalf_t (#6543) 97a90d5dd OpenACC: add atomics support (#6446) fb73a7359 Merge pull request #6589 from kokkos/revert-6586-desul_sycl_device_global_supported 6023c1919 Merge branch 'release-candidate-4.2.00' for 4.2.00 81e308e7d Revert "Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header" 3f773d057 OpenMP: No memset in viewfill (#6573) 0a83695e5 Replace Marsaglia polar method with Box-muller to generate a normally distributed random number (#6556) 91ee4e1e1 Merge pull request #6569 from masterleinad/cleanup_static_assert_kokkos_impl_do_not_use_printf 13c6c5783 Merge pull request #6586 from dalg24/desul_sycl_device_global_supported 605784265 [ci skip] Adding Changelog for Release 4.2.0 (#6583) c8b4fe848 Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header 26464df04 SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path (#6534) fcb0452d0 OpenMPTarget: Guard scratch memory usage in ParallelReduce d4a517f82 Set the device id explicitly for CUDA API calls in impl_initialize 80084960c simd: temporarily skip device math ops unit test for OpenMPTarget build (#6574) a453e9fc3 Simplify fence functions in the Threads backend (#6571) 8d9400e29 Add crtrott's launch_latency benchmark (#6379) 403c34f30 m_cudaDev isn't static anymore a7b16b351 OpenMPTarget: CI compiler upgrade. (#6545) c7a162342 Merge pull request #6576 from dalg24/remove_cuda_clang_workaround 1e1ed1318 Drop Clang+CUDA workaround 6fc7a4930 Merge pull request #6575 from dalg24/drop_unused_memory_fence_header cead4f559 [ci skip] Drop unused header 3a285ecf1 Merge pull request #6276 from ldh4/simd_add_missing_unit_tests 21a3d6f12 Merge pull request #6570 from ldh4/simd_move_fallback_impls 3b8c449f1 Remove empty quotation marks for static_assert b76e1dcc1 fallback implementation cleanup 024d6c21b Remove unused Sandia testing files (#6568) 6da3fa7e9 Threads remove unused variables and functions (#6566) 0e5aa1503 Merge pull request #6553 from masterleinad/avoid_redundance_algorithm_unit_test_variables a07c7a2b6 Address reviewer comments 8c4fe6b06 Merge remote-tracking branch 'upstream/develop' into cuda_multiple_devices_constructor 6eb12dbc9 Rollback changes to view constructors to reduce the number of instantiations (#6564) fd80cbef4 Merge pull request #6541 from Rombur/threads_refactor_3 6d95b621e Remove logical memory spaces 54f2e7f23 Merge pull request #6548 from Rombur/hip_split 3093a0e64 Only define STDALGO_TEAM_SOURCES_* once 3edbef33d Merge pull request #6531 from masterleinad/sycl_use_ext_oneapi_device_global_feature_macro 400dd1d99 Trim some fat in `CudaInternal` (towards multiple GPUs support) (#6544) 201d1dead Merge pull request #6536 from dalg24/view_constructor_from_label 6b4ee34ee Split files in HIP backend 13efa71ac Merge pull request #6547 from msimberg/bump-hpx-1.9.0 a41df08a7 Bump HPX version used in CI to 1.9.0 840d6b775 Reduce number of View constructor instantiations 189aaa6da Merge pull request #6542 from Rombur/fix_guard b4f27c87f Fix typo in macro guard c4d0dfe02 Fix indentation 33010ecc3 Add comments a417450bb Remove spawn function bb759df49 Remove useless forward declaration 7d31c2273 Small cleanup of ThreadsInternal::initialize 6ac5aa846 Remove Sentinel struct from Threads b875be75d Remove unused variables 09756717d SYCL: Use host-pinned memory to copy reduction/scan result (#6500) 6056c6b1f Merge pull request #6537 from Rombur/threads_refactor_2 3bcf9657f Prefer defaulted default constructor for Bitset (#6524) 1fcce6936 Remove extra constructor 9158785df Remove sleep and wake functions ae0bd54eb Added unit tests for reduction ops and few intel svml intrinsics cf5a859bf Threads: replace enum with constexpr int and enum class (#6514) e156d5859 Check that device associated with stream matches requested device 0b59a1b40 Merge pull request #6512 from eltociear/patch-1 a30b9aa78 Fixup in README (github -> GitHub) 4383d1c1e Merge pull request #6528 from ndellingwood/cherrypick-6516 a440ac9e3 Merge pull request #6527 from ndellingwood/cherrypick-6518 4c88f2569 Merge pull request #6525 from dalg24/bitset_do_not_mess_with_labels 92cc6ce95 Merge pull request #6523 from dalg24/rm_deprecated_code_3 2bc1721d7 SYCL: Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL to detect support for device global variables e9a540605 Merge pull request #6516 from fnrizzi/fix_6502 86e3c8db7 Merge pull request #6518 from fnrizzi/fix_6515 0120c431b Merge pull request #6516 from fnrizzi/fix_6502 ef889a7ab with_updated_label -> append_to_label 380754b91 Do not append " - blocks" to the bitset label 589ad55b0 fixup! [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math functions 2e6765a2a [deprecated code 3] remove ENABLE_DEPRECATED_CODE_3 option 7c63c32bd [deprecated code 3] remove MasterLock fb0bd5297 Get rid of FIXME_OPENMP ca49c65f6 OpenMP backend cleanup following removal of deprecated code 3 0505ce294 [deprecated code 3] remove {OpenMP,HPX}::partition_master 3172fd1b0 [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math constants d515a51ea [deprecated code 3] remove using declaration in Kokkos::Experimental:: for all math functions 35dda2ac6 [deprecated code 3] remove using declaration in Kokkos::Experimental:: for clamp, min, max, and minmax 57c0aa61f [deprecated code 3] remove KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macros dfd0a6d31 [deprecated code 3] remove InitArguments e6c51df7f [deprecated code 3] remove all default device init tests 3490ec1e7 Merge pull request #6520 from masterleinad/update_kokkos_version_develop 505c396dc Merge pull request #6518 from fnrizzi/fix_6515 629135a0f [ci skip] Update Kokkos version to 4.2.99 b26a1f735 avoid auto 7b86b80a9 add guards 0c0cafaba Merge pull request #6509 from Rombur/threads_team 58f53a6a2 Merge pull request #6510 from ndellingwood/fix-werror-pedantic 78c1ed885 Kokkos_SIMD_Scalar.hpp: remove extra ';' a0cacc305 Rename Kokkos_ThreadsTeam.hpp to Kokkos_Threads_Team.hpp 654d283a6 Update version number for 4.2.00 release bd361e562 Merge pull request #6505 from Rombur/threads_instance 3beb7f191 Merge pull request #6499 from masterleinad/nvhpc_impl_only e2ad3b36b Merge pull request #6506 from masterleinad/promote_kokkos_printf_header 89bd35cc3 Merge pull request #6198 from uliegecsm/unordered-map-space 0cad570ad Merge pull request #6503 from fnrizzi/relax_guards_team_algos 04a631081 Update CI in OpenMPTarget to use llvm-17 (#6472) c586fa172 simd: add floor, ceil, round, trunc operations (#6393) 1095b640e Merge pull request #6497 from fnrizzi/openmptarget_scan_return 5518eb99e Promote Kokkos_Printf.hpp to public include 6ff5721a6 Rename Kokkos_ThreadsExec to align with the other backends 5544c0c22 UnorderedMap(space instance): proposal for #6067 adc885184 remove guards 377b3f057 fix order 02e6bdcce ad threadvector bc83a8912 Merge pull request #6430 from aelovikov-intel/fix-red 578bc7f91 Merge branch 'develop' into openmptarget_scan_return e8687a5df Merge pull request #6495 from masterleinad/hpx_parallel_scan_team_thread_thread_vector 63d9ae201 Merge pull request #6493 from fnrizzi/fix_team_uniquecopy_copyif a856f973e Allow NVHPC as device compiler only with Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON e4038bcd0 Merge pull request #6498 from Rombur/threads_split_files f511dca95 SIMD: Split math functions from SIMD_Common.hpp (#6487) 8420c2f00 Update to HIP TeamPolicy Block number heuristic (#6284) 4e69e4010 Merge pull request #6370 from Rombur/hip_graph 5b693fd95 address review comment fdfeaf916 add overload for TeamThreadRange d97f16f44 Merge pull request #6479 from uliegecsm/fix_hip_concurrency ebef19bdf Serial: Allow for distinct execution space instances (#6441) 23496b47e HPX: Implement TeamThread and ThreadVector parallel_scan with return value cb22b8061 Split Kokkos_Threads_Parallel files f5c0cc5a4 Update core/src/HIP/Kokkos_HIP_KernelLaunch.hpp ad9eb209f Merge pull request #6308 from thearusable/5635-sycl-parallel-scan-with-value-ThreadVectorRange a601d81f9 Merge pull request #6490 from masterleinad/fix_build_cmake_installed_different_compiler 1f4e3d5db fix impl 8659ffa0b Fix example/build_cmake_installed_different_compiler 1ebb3afc4 Merge pull request #6484 from masterleinad/fix_bessel_function ef1922d30 Merge pull request #6394 from masterleinad/simd_checks_neon 7fafc641a Merge pull request #6485 from masterleinad/fix_simd_cuda_compilations 8181d7075 Fix atomic operations bug for Min and Max (#6435) ee8d58ea3 Merge pull request #6482 from uliegecsm/iostream 9035ab2d3 Merge pull request #6486 from ajpowelsnl/fix/6451 c5bf8705d #5635: SYCL: Add parallel_scan overload with value for ThreadVectorRange 1ccf4995b Also fix annotations for generator constructor for AVX512 and NEON e52b957db Merge pull request #6305 from thearusable/5635-threads-parallel-scan-with-value-ThreadVectorRange e40f026ef team-level std algos: part 13 (#6351) 890148e5d Fix NVCC warnings (#6483) 4d3958bec guards to ensure DBL_EPSILON return for POWER8,9 96edf73bd Fix compiling SIMD unit tests on NVIDIA 6b21fde9e cleaning: remove iostream from headers where possible (IWYU) 6ff0deb9b Fix implementation for cyl_bessel_i0 29d4ffdbf Add KOKKOS_ARCH_ARM_NEON 4ce289baa Allow detecting SIMD types based on compiler macros (#6188) 495b1ccfd Add parallel_scan overloads with value for Threads c63f125ec Add test for parallel_scan with return value for ThreadVectorRange 0bf937cdd Moving abort and assert into their own public headers (#6445) 2075ae79b core/src: Add half single and double mixed compare (LT,GT,LE,GE) (#6407) e04f637dc Merge pull request #6307 from thearusable/5635-sycl-parallel-scan-with-value-TeamThreadRange 567524c8f Merge pull request #6242 from thearusable/5635-hip-parallel-scan-with-value 3ad6473f6 Merge pull request #6465 from masterleinad/simd_math_functions fbdb0e04f Merge pull request #6478 from masterleinad/minimum_version_google_benchmark 872ffb770 Merge pull request #6474 from uliegecsm/dualview_compatible_copy_constructor_assignment 148e6a6c3 Merge pull request #6471 from masterleinad/fix_openmp_teamthreadrange_parallel_scan_return 60e4d1359 team-level std algos: part 12 (#6350) d8f8142ab Use call operator instead of run_me function 94c5d9ab6 Modify test so that source and destination view are of different type 744711864 Compute concurrency on HIP using Kokkos hardcoded m_maxWavesPerCU 39316fa8c Add test of copy constructor/assignment operator for DualView. e1f2cf545 Fix minimum version for Google benchmark 82044c696 Add compatible copy assignment operator to DualView b610a288b OpenMP: Fix TeamThreadRange parallel_scan with return value for team_size > 1 bbfe63981 Use std::is_same_v 7d817b88b #5635: SYCL: Add parallel_scan overload with return value e4eb204ee #5635: Move some tests for parallel_scan to TestTeamScan df1901b1c Use std::is_same_v 6c6a26ab1 Add parallel_scan overloads with value for HIP backend 41cf2e51c Merge pull request #6303 from thearusable/5635-threads-parallel-scan-with-value-TeamThreadRange 68a97a1fb Merge pull request #6235 from thearusable/5635-cuda-parallel-scan-with-value 5150a9fad Merge pull request #6463 from masterleinad/sycl_disable_bessel_test_intel_gpus c395c0cf1 Fix formatting 61e7b262d Skip testing for non-power-of-two team sizes b8d4feb26 use shortcut 4f6ddd190 #5635: Move some tests for parallel_scan to TestTeamScan 190bfe4ab #5635: Add parallel_scan overloads with value for Threads 1675997f2 #5635: HIP: Add Overloads for parallel_scan with return value for TeamThreadRange (#6302) 9db1ea46d team-level std algos: part 11 (#6258) 925032879 Merge pull request #6378 from cwpearson/feature/gups-permute-mode d458fdadb team-level std algos: part 10 (#6256) f6977cf43 Check for default device 6e2ca15da Merge pull request #6464 from masterleinad/restrict_avx2_workaround_rocm5_67 c195ee69a SYCL: Disable another bessel function test for Intel GPUs b85563160 SIMD: Math functions should be in namespace Kokkos e542e989a improve tests to check intra-team result (#6431) 9b3778134 fixes build error for TeamReduce and TeamTranformReduced tests for specific GCC (#6459) b813f2bb3 HIP: Restrict AVX2 workaround to ROCm 5.6 and 5.7 e1c82660e Merge pull request #6462 from fnrizzi/fix_warning_random_test_windows b9fa28cfb Workaround for ROCm 5.6+ failing to compile with AVX2 SIMD support (#6449) b2a1820b0 fix casting warning in Random test 988a9e6a9 Move final assignment to correct scope 2e743674a improve tests (#6437) 1f0183bc1 improve tests (#6432) 773e34648 team-level stdalgos: improve tests, check intra-team result matching (part 3/7) (#6425) 5f279b02c Fix parallel_scan_with_reducers test 8e4820194 Fix race condition in functor_vec_scan_ret_val test e13f67ca6 team-level stdalgos: improve tests, check intra-team result matching (part 6/7) (#6436) 5ed274c93 Skip bessel function tests known to fail on Intel GPUs (#6434) 3cd281376 team-level stdalgos: improve tests, check intra-team result matching (part 2/7) (#6426) d8fa85644 std_algos: improving min, max, minmax (#6421) 6f9e50c83 Merge pull request #6301 from thearusable/5635-cuda-parallel-scan-TeamThreadRange 06c6a73ab Merge pull request #6213 from fnrizzi/team_level_p10 ecbe79507 Merge pull request #6452 from masterleinad/disable_simd_compiler_macro_check_ompt a56b433ce Merge pull request #6455 from fnrizzi/fix_6442 8239de526 Fix compiling code using Kokkos::printf for OpenMPTarget on Intel GPUs (#6443) 36af9d6e4 Merge pull request #6456 from fnrizzi/fix_6440 fb20a482d Assign final sum in Cuda parallel_scan ThreadVectorRange 4a819b6b7 Fix Cuda parallel_scan ThreadVectorRange range 6f85f19eb Merge pull request #6454 from fnrizzi/fix_copyif_team_test_assert 7e2749632 OpenMPTarget init-join fix (#6444) 56cc35bdf re-enable unit tests for sort and random via makefile (#6422) e95075930 fix unreachable for intel bcb92a619 fix intel compile error ba9165994 add intra team check for missing test 4a266d8ee #5635: Add test for parallel_scan with return value for ThreadVectorRange 2b7eb0b0b add missing assert 96320555f #5635: Add parallel_scan with value for CUDA and ThreadVectorRange 7284cd215 OpenMPTarget: Disable check for SIMD compiler macros e743017e8 benchmark/gups: use CMake 002cce07f Clean up benchmarks/gups 6d794df99 #5635: Enable TeamThreadRange test for CUDA db8498389 remove old impl 1fb6f4a74 #5635: Add parallel_scan changes for CUDA and TeamThreadRange 47aecc6c4 Merge remote-tracking branch 'upstream/develop' into team_level_p10 6a95b5f3a Merge pull request #6292 from thearusable/5635-serial-parallel-scan-part-2 6494d96c1 Merge pull request #6212 from fnrizzi/team_level_p9 8bff82ff8 try fix for unique, previous impl to remove later 615fc1aef Fixes for Kokkos::Array (#6372) 7e35f1087 Merge pull request #6433 from masterleinad/cuda_fix_m_num_scratch_locks_initialization 541f67468 improve tests with intra-team result check d270e064a Merge pull request #6428 from masterleinad/enable_kokkos_isnan_for_bhalf_t c046bdba1 Merge pull request #6429 from tcclevenger/hip_potential_race_condition 6979f67ca Initialize m_num_scratch_locks for Cuda parallel_for TeamPolicy fea838822 Same for scan 89a42341c [SYCL][Reduction] Group counter should use at least memory_order::acq_rel 41253bd55 Set the device id in cuda_kernel_arch 111371f10 Merge remote-tracking branch 'upstream/develop' into cuda_multiple_devices_constructor 96bb26b0c avoid potential race condition HIP 732d39219 Fix guard for isnan test for bhalf_t 9081d366c Merge pull request #6423 from uliegecsm/viewmapping_comparison 3bbbe2ba5 improve tests to address review e3a608bb5 add comment 4389d81c4 Fix to avoid #186-D pointless comparison warning. ba1bd2303 Merge pull request #6418 from dalg24/uvm_warn_once cb459e741 Merge pull request #6419 from dalg24/cuda_bock_size_deduction_device_properties 035d28487 Address reviewer' comments 45646ab3b Use execution space instance argument to get device properties in block size deduction 582dfeac7 check-copyright improvements (#6399) 26a4cd43a Only warn once (at initialization) when forcing allocation in unified memory 03ba69e07 Merge pull request #6417 from dalg24/drop_check_support_unified_addressing eb8ee282a fix single as per Christian's suggestion 21f72433b Drop check whether device supports unified addressing c32f9c90b Merge pull request #6415 from cz4rs/fedora-enable-death-tests 1c0c73402 Merge pull request #6413 from dalg24/pre_kepler_arch_not_supported 1affb05d0 core/src: Add half math functions to private header (#6124) c19926fd5 Enable death tests for fedora rawhide 7d9394ddb formatting 1cb10cb27 Merge remote-tracking branch 'upstream/develop' into team_level_p10 f4d7ea559 Merge remote-tracking branch 'upstream/develop' into team_level_p9 fc213ead1 Team-level std algos: part 7 (#6211) db591e369 formatting afac7784f address comments 8a3ec1bf5 Merge remote-tracking branch 'upstream/develop' into team_level_p10 dd5c624b7 use single c4c9ed551 Merge remote-tracking branch 'upstream/develop' into team_level_p9 fd774d916 Merge pull request #6411 from dalg24/precondition_not_initalized 28061e84d Drop pre-Kepler logic in Cuda::impl_initialize 7b4d0a6f7 Merge pull request #6410 from dalg24/unused_hip_internal_data_members c692a816d !initialized() should be a precondition for calling {Cuda,HIP,SYCL}Internal::initialize d8846bf84 Merge pull request #6409 from dalg24/host_exec_initialized_before_device 881801cb4 Drop unused HIPInternal::m_hipArch static data member 6aaf3736b Drop unused HIPInternal::m_maxSharedWords data member 53b2b2285 Merge pull request #6408 from dalg24/drop_cuda_internal_maximum_shared_words c6cd24794 Drop check that the host backend is initialized before the Cuda/HIP/SYCL one 34aebe55c Drop (unused) `Cuda::cuda_internal_maximum_shared_words` 3c97512cc OpenMP backend refactor files. (#6403) 93fe629cf address comments c53a95e0c Merge pull request #6402 from dalg24/cuda_malloc_async_on_by_default bda5326b1 team-level std algos: part 6 (#6210) 49d4048cd Merge pull request #6405 from cz4rs/fix-cmake-warning-benchmarks 31c060ab5 Merge pull request #6401 from dalg24/manage_stream_should_be_private 629128cb7 Merge pull request #6406 from cz4rs/appveyor-disable-benchmarks 201f78d6d Merge pull request #6400 from dalg24/checked_integer_ops_death_category e5a23d10e Disable performance benchmarks in AppVeyor CI 926a6420f Use archive extraction time for timestamps 159942703 Preserve one build that disables Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC 7cdac473b Introduce disable_malloc_async Cuda option with generated makefiles cd285dbaa Print whether KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC is defined ac23f369d Let Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC be ON by default 30a4da945 Deprecate HIP(hipStream_t, bool) e7174a8fb Fixup checked interger operations death test 7353bdf95 Deprecate Cuda(cudaStream_t, bool) 393abd3ef team-level std algos: part 5 (#6209) 8aae5eaa9 Added a gen ctor for float (#6397) de4380e9b team-level std algos: part 4 (#6208) e76708ae1 SIMD: add float simd support (#6177) 4617e4a9c Adding is_scoped_enum & to_underlying (#6356) df37e47dc Merge pull request #6395 from msimberg/hpx-no-local-headers-runtime 578dafa84 team-level std algos: part 3 (#6207) 43d3e533d Add a more comprehensive `kokkos_{malloc, free}` perf_test (#6377) fa1aaa712 Explicitly check for valid device id d8130353f Don't use local headers or runtime in HPX backend due to deprecation d122c39a8 Check AVX, AVX2, AVX512XEON compiler macros on KokkosCore_config.h (#6248) 66f5c0428 Merge pull request #6391 from Rombur/fix_kokkos_nightly 168378611 Fix typo. e627b2e59 Add test 1d298d0fb HIP: Update print_configuration (#6387) 7c2631c59 Fix uninitialize variable warning with gcc 13 9fde326a3 Fix reviewer's comments e1f9739b7 Merge pull request #6389 from dalg24/rempvpe_tmp_pattern_specialization_from_tag 64a9b3d85 Fix typo 23ceffb16 Remove stray Cuda graph pattern specialization from tag 592f20cf3 Merge pull request #6220 from ldh4/simd_add_gather_scatter f34046625 Merge pull request #6386 from dalg24/simd_suppress_nvcc_attribure_ignored_warning 3b55f5412 Merge pull request #6371 from crtrott/nvhpc-23-1 6dd93217a Update base docker file for nvhpc d5573d71c Work around OpenMPTarget failure 85f66eb34 Update nvhpc gtest skip message 8886eb6c4 Use NVHPC 23.7 for testing of OpenACC b70d78f5c Fix TestAtomic to use the test execspace 3223007e1 OpenACC: Guard tests relying on abort 7a8f5531e NVHPC 23.7 update: address reviewer comments cc278c44a Update nvhpc to version 23.7 in the CI 7805bdc54 NVC++ clang-format fixes bb4e9e110 More NVC++ 23.7 updates f49065953 Update containers and algorithms for NVC++ 23.7 5a9aee36f Reenabling tests for nvhpc 23.7 775cc5562 Disable a test not working with nvhpc-23.1 3e8a53cd4 Merge pull request #6384 from ldh4/simd_use_host_device_lambda f85f499a1 Merge pull request #6385 from crtrott/remove_deprecated_3_volatile_join c7ea58f18 Remove deprecated code 3 support for volatile join 4cb688c84 Suppress NVCC attribute ignored warning on SIMD operators as hidden friends b35c741f2 Changed gen ctor usage to pass in KOKKOS_LAMBDA 09fd0c207 Rebased and applied feedbacks dab159da0 Added gather_from and scatter_to for AVX2 and AVX512 simd Added gather_from and scatter_to for NEON simd c740f55eb Merge pull request #6382 from dalg24/hostspace_allocation_mechanism_deprecated_declaration_warning 5722ba863 Merge pull request #6249 from Rombur/hip_56 b26a6c620 Add deprecated attribute to HostSpace(AllocationMechanism) definition 15d5217f1 team-level std algos: part 2 (#6205) 7e45f17a7 simd: make mask and condition unit test to check with all data types (#6360) bb0633e9d SIMD: convert binary operators to hidden friends (#6320) 4222b841e Merge pull request #6236 from tcclevenger/potential_racecondition_in_cuda_reduce a7b23b60f Merge pull request #6376 from Rombur/constexpr e22a487ed Use constexpr West in test e6b98b7ee Use constexpr West in src 277e4a614 Do not use HIP Graph with ROCm 5.2 7e299b4e2 Merge pull request #6369 from masterleinad/workaround_posix_memalign_osx fdb50e03a Fix -Wformat-truncation warnings in CI (#6354) 4c5c00827 HPX: Don't interfere with exception handling dc076947d Use aligned version of operator new instead of aligned_alloc c5e13a5ee Replace one of the ROCM 5.2 configuration with ROCM 5.6 96b157b6c Add support for HIP Graph c8ec39728 Merge pull request #6368 from msimberg/task-ready-count-desul-atomic-load 756b7203c Add Kokkos_Atomic.hpp headers to OpenMP and HPX task headers 7a24e94b8 Replace volatile m_ready_count loads by desul::atomic_loads in task queue implementation ca3687e63 Merge pull request #6365 from masterleinad/fix_amd_gpu_macro_define ab10d2bba Extend 'hip_driver_check_page_migration' (#6364) 682ca6984 Only set KOKKOS_ARCH_AMD_GPU if a AMD GPU architecture is enabled b8482317e Introduce constructor for multi-GPU support. 04d5c55a4 Initial implementation of gfx942 (#6358) 8d8b24a1f Use std::aligned_alloc for allocations (#6341) 3eb0bca34 Merge pull request #6357 from msimberg/bin-bash-to-usr-bin-env-bash b1b6d123f Merge pull request #6353 from masterleinad/fix_simd_neon_gcc_13 e454cc6c4 Fix cudaAPI wrapper errors for CUDA_MALLOC_ASYNC (#6346) 79348d5f2 Use /usr/bin/env bash in nvcc_wrapper and kokkos_launch_compiler for portability da49ee2ac SIMD: add generator constructors (#6347) 3c7c0e19d Fix compiling SIMD libraray with NEON and gcc-13 2e744dc33 Merge pull request #6267 from masterleinad/kokkos_split_parallel 0bcb28fda Merge pull request #6342 from masterleinad/disable_onedpl_support 7e91f11f0 Rename AMD GPU architectures (#6266) 3a58c3ae4 Merge pull request #6340 from arghdos/cleanup_clangrt 4df576109 Disable default oneDPL support in Trilinos cb4ac66e8 format f5dde3d38 fix syntax c46aa0906 remove file 69389fcd3 revert files a3a00a6a2 keep only subset 92bcdd06b Merge remote-tracking branch 'upstream/develop' into team_level_p10 5742e3f71 fix lambda capture and constraints e4f7383e3 fix 30b6ca729 remove file 4d4fde30f remove unnecessary file, fix constraints f49c11e2e Merge remote-tracking branch 'upstream/develop' into team_level_p9 e550cb751 fix lambda capture 11c6cf74b Fix #6336: capturing `this` implicitly is deprecated in C++20 (#6337) 639c1da18 Fix #6334: intel compiler does not like returning inside if constexpr (#6335) 50fd7afbf Adopt new HIP cmake's way of finding clang-rt 40fb259c3 allow sorting via native oneDPL to support views with stride = 1 (#6322) 0ebedee2c Merge pull request #6304 from masterleinad/improve_static_asserts_create_mirror f557d3dd2 Passing size==0 to DeepCopy/memcpy/omp_target_memcpy (#6273) fcd6132fe Use checked arithmetic builtins for overflow detection (#6313) 00dca3a43 Merge pull request #6333 from Rombur/for_each f65c24472 Merge pull request #6328 from Rombur/gtest 6991420ad Run NVHPC only on V100 6d2005a5c Merge pull request #6200 from fnrizzi/team_level_nonmod_seq_p1 3c2d948d7 Merge pull request #6329 from masterleinad/avoid_conflicting_namespace_sycl_algorithm fe2c6be9f Merge pull request #6327 from Rombur/ccache 108bfe24b Merge pull request #6316 from Rombur/hip_55 140e42c21 SIMD: split simd unit tests into separate files (#6278) e7593a74b formatting 9d13d3bb6 Merge remote-tracking branch 'upstream/develop' into team_level_p9 c6d98a607 address reviews [skip ci] 963f7833b Fully qualify Experimental::SYCL in algorithms to avoid conflicting namespaces f193f902c Fix gtest when using C++20 b0e676bbf Decrease maximum memory available to ccache 5e0704ecd remove spurious undefs 23870b36b SYCL: Use sycl::bit_cast from oneAPI 2024.0.0 on (#6300) 78e8a3267 SIMD: add shift ops for all int types (#6109) a85dc980d Merge remote-tracking branch 'upstream/develop' into team_level_nonmod_seq_p1 d89140de0 Update Clang+Cuda CI from 10.1 to 11.0.3 (#6318) b57fb7317 Allow using the SYCL execution space on AMD GPUs (#6321) 15a9c9cc8 Fix reviewer's comments 929cac250 #5635: Serial/OpenMP: Parallel_scan with return value for TeamThreadRange (#6090) afbbdcc2e #5635: Add test for parallel_scan with return value for ThreadVectorRange 7290d5256 #5635: Add parallel_scan overload with return value for ThreadVectorRange 39de95933 Merge pull request #6299 from tcclevenger/thread_saftey_for_cuda_api_calls_individual_wrappers a4f57aa2d Disable failing tests for ROCm 5.5 and 5.6 3e5d43e4c Error out when CXX standard is not set when using amdclang or cray clang 74b5043ef Add comment that we don't need to link against clang_rt with ROCm 5.5 176c1c741 Split Kokkos_Parallel_Team.hpp d47f9eca7 Move Kokkos_SYCL_Scan.hpp 7ec0abe1a Split Kokkos_SYCL_Parallel_Reduce.hpp 236065bbd Split Kokkos_SYCL_Parallel_Range.hpp 6cffdb416 [HIP] Optimize parallel_reduce (#6229) 8b0841596 Merge pull request #6309 from cz4rs/fix-hpx-with-serial-enabled bc56d2998 add helper variable templates `are_*_iterators_v` (#6312) 26cd6cc3d Update CI: use CUDA 11.6.2 instead of 11.6.0 (#6314) c893105c6 Remove static in comment 0e97679b1 Use "if constexpr" for setCudaDevice ba6b4d9d1 Rework stream inputs c4278e121 Cuda10 requires "stream=nullptr" as default arg 3f19a97b8 Some api function require cuda11.2+ 03039fe45 Reorganize #include 4d8629f26 create cudaAPI function wrappers aaf00c0f4 bring back previous code as discussed in meeting 2f12ebb3e Improve SYCL reduction performance: MDRangePolicy (#6271) ced245196 Check for overflow during backend initialization (Cuda, HIP, SYCL) (#6159) 5f7d27505 Modify fences in View API test c0266e93c Enable Serial backend in HPX build 5a7039f8e Improve macro definitions 00ff0a571 Don't suppress warnings for NVHPC 8ef6cb55d Suppress warnings 44bf94578 Use KOKKOS_IF_ON_HOST 3a255d185 Enforce create_mirror restrictions w.r.t. ViewCtorArgs on all variants b210bff00 format a17ca7b5c std_algos: for_each: try condense the impl 4d1c6c351 Fix a memory bug in the `free_state` function of random pools. (#6290) 2dfb861e2 Merge pull request #6298 from Rombur/nightly_fixup 1b863ef96 Fix typo in nightly jenkins configuration ada7d5bfa Merge pull request #6295 from Rombur/nightly_c++23 db1fa2709 Explicitly capture this in lambda function f448ac8d0 Add nightly build using latest gcc and c++23 78b856fd2 `Kokkos::sort` support custom comparator (#6253) 8452f8d2a OpenACC CMakechange Clacc (#6250) ba79dc43e SIMD: suppress a uninitialized variable warning (#6294) 1eb7c2d55 Merge pull request #6293 from masterleinad/fix_sycl_sign_compare 93933a417 SYCL TeamPolicy: Fix sign comparison warning 3f5be1bd7 Merge pull request #6288 from Rombur/cuda_11.7.1 11ce288cd Improve SYCL reduction performance: workgroup_reduction (#6270) 933d23b1f Improve SYCL reduction performance: RangePolicy (#6264) 5cfb2f8d2 Update CI from CUDA 11.7.0 to 11.7.1 ed94e603f Update CMakeLists for unit tests with OpenMPTarget, OpenACC with NVHPC (#6260) 5d92feed6 Merge pull request #6272 from masterleinad/improve_reduction_performance_sycl_4 96922d9a4 address comments 48ecaef0e fix for openmptarget 6423c55cf guard for openmptarget b7ba3c963 fix copyright 57212062c subset of team level impl of std algorithms 1c20b3633 Merge pull request #6283 from masterleinad/avoid_conflicting_namespace_sycl_algorithm 7061a4bd0 Merge pull request #6282 from cwpearson/feature/export-compiler-version eeacb8530 Fully qualify Experimental::SYCL in algorithms to avoid finding conflicting namespaces 73c924a31 make Kokkos_CXX_COMPILER_VERSION available to CMake consumers 7e6871f6a Merge pull request #6252 from dalg24/deprecate_kokkos_vector af2bfe07a Merge pull request #6269 from ndellingwood/fix-6268 ae2454885 Improve SYCL TeamPolicy reduction 597bc36e9 Implement Kokkos::printf (#6083) 53a5aef3a Merge pull request #6265 from Rombur/copy_constructor 14c5c8403 Workaround gcc/8.2.0 compiler issue with _mm512_abs_pd 36ec6fae8 Merge pull request #6263 from masterleinad/fix_simd_neon 47f9ceec5 Add default ParallelFor copy constructor for HIP 8296fb127 Fix SIMD tests on NEON f050fbc66 Merge pull request #6262 from dalg24/fixup_simd_abs 778225ffb Fix SIMD abs unit test accidentally using complex overload dada6e43c Merge pull request #6251 from dalg24/complex_cv_unqualified_floating_point 35c1e3c37 SIMD: Add abs() for all int types (#6069) 91fdd34f6 Error out when Kokkoks_Vector.hpp is included with deprecated code disabled 990ac55ad Make sure macros are defined 07a0f0c2c std_algos: fix wrong corner case for `is_partitioned` (#6257) 6a8488d78 SYCL: Use in-order queues in InterOp tests (#6246) 46a3c752b Ignore in the header self-containment tests d0da878e5 Drop Vector test with makefiles and conditionally remove it with CMake 2fc969e94 Warn if is included 648d86111 Deprecated Kokkos::vector 113055e3c Ensure that complex is only instantiated for cv-unqualified floating-point type 13948ecf8 Merge pull request #6239 from fnrizzi/sort_slim_api_move_impl 7a18e97bf Merge pull request #6245 from masterleinad/avoid_undefined_behavior_test_task_scheduler e1f5701d3 Merge pull request #6247 from bartlettroscoe/remove-tribits_exclude_autotools_files 3c4dff5d1 Remove calling tribits_exclude_autotools_files() 9d5759a3a Avoid undefined behavior in TestTaskScheduker.hpp 0db68e41b Fix whitespace in bug_report.md (#6244) 70a4a0a9e bug_report.md: new PR branching from `develop` (#5034) 2941c913e fix corner cases 2cd6bbc79 Fix windows symlink configure issue (#6241) 5e3e5f4b3 fix corner case 91b57f1d9 Fix AVX2 simd support for ZEN2 AMD CPU. (#6238) 2a1c79bf8 use exespace to check rather than mem space 543accd71 refine for cuda uvm 492bb57c3 improve all other corner cases as per review comment 329b40a79 Merge pull request #6240 from seyonglee/openacc_bugfix 016d5d776 This PR contains minor code changes and bug fixes needed for LLVM-Clacc compiler to compile the OpenACC backend. afc873b5f slim API and move code to impl 5b6cb80a2 make constraints on `Kokkos::sort` more visible/clear (#6234) 855e93a04 only compute with relavent entries 9e6befca0 SYCL: Support for bhalf_t (#6204) 5dd399c28 reorganize sort headers (#6230) 79bc7add4 Merge pull request #6181 from e10harvey/remove_push_trigger 80dd6adad Merge pull request #6189 from masterleinad/sycl_in_order_queues 7c62855d7 Merge pull request #6223 from masterleinad/fix_simd_on_gpus 2b2c35de0 Changelog for 4.1.00 (#6225) c229f2eea Disable KOKKOS_ARCH_AVX512XEON for NVHPC 99fe13416 Move scalar overloads to Scalar header 995a6a6c2 Avoid SFINAE in favor of overloads cf76d5091 Make in-order queues the default via macro 4bfa10e0e Fix host-device annotation for where_expression/const_where_expression fff1bc6ef Fix a gcc-8.4.0 warning 4a57a2ebc Introduce impl_get_value/impl_get_mask 0de2de2b1 Fix host-annotations of AVX2, AVX512, and NEON constructors 352f4c3ac Disable AVX512 support for NVHPC 3163aefc0 snapshot mdspan namespace changes (#6162) 8d410bd24 Merge pull request #6218 from masterleinad/fix_sycl_makefile 0714f3ff6 Fix compiling SYCL with KOKKOS_IMPL_DO_NOT_USE_PRINTF_USAGE 734e1da29 Merge pull request #6214 from ndellingwood/update-testing-scripts 7cc2e453b team-level std algos: common code needed (#6199) 630387491 [ci skip] test_all_sandia: update compilers and queues 61839dccf implementation and tests 68378682c implementation and tests 63d695ce5 Merge pull request #6202 from masterleinad/update_version_develop d78a24f72 Merge pull request #6203 from masterleinad/fix_libquadmath_test bcd18c4fd Fix test_quad_precision_math_constants test 005a85b27 Update version number on develop after branching off for 4.1.00 76b626ebd .github/workflows: Only trigger upon push to develop d730ed70d .github/workflows: Remove push trigger git-subtree-dir: tpls/kokkos git-subtree-split: 08ceff92bcf3a828844480bc1e6137eb74028517 --- .github/ISSUE_TEMPLATE/bug_report.md | 25 +- .github/dependabot.yml | 6 + .github/workflows/clang-format-check.yml | 15 + .github/workflows/codeql.yml | 51 + .../continuous-integration-workflow-32bit.yml | 16 +- .../continuous-integration-workflow-hpx.yml | 21 +- .../continuous-integration-workflow.yml | 56 +- .github/workflows/osx.yml | 13 +- .github/workflows/performance-benchmark.yml | 9 +- .github/workflows/releases.yml | 83 + .github/workflows/scorecard.yml | 73 + .github/workflows/windows.yml | 35 + .jenkins | 69 +- .jenkins_nightly | 68 + .olcf-gitlab-ci.yml | 12 + CHANGELOG.md | 453 ++- CITATION.cff | 65 + CMakeLists.txt | 14 +- Copyright.txt | 49 +- LICENSE | 10 - Makefile.kokkos | 256 +- Makefile.targets | 15 +- README.md | 69 +- SECURITY.md | 12 + Spack.md | 1 - algorithms/CMakeLists.txt | 2 +- algorithms/src/CMakeLists.txt | 4 +- algorithms/src/Kokkos_NestedSort.hpp | 180 +- algorithms/src/Kokkos_Random.hpp | 54 +- algorithms/src/Kokkos_Sort.hpp | 760 +---- algorithms/src/Kokkos_StdAlgorithms.hpp | 1 - .../src/sorting/Kokkos_BinOpsPublicAPI.hpp | 129 + .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 410 +++ .../sorting/Kokkos_NestedSortPublicAPI.hpp | 100 + .../src/sorting/Kokkos_SortByKeyPublicAPI.hpp | 117 + .../src/sorting/Kokkos_SortPublicAPI.hpp | 196 ++ .../impl/Kokkos_CopyOpsForBinSortImpl.hpp | 61 + .../sorting/impl/Kokkos_NestedSortImpl.hpp | 114 + .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 424 +++ .../src/sorting/impl/Kokkos_SortImpl.hpp | 416 +++ .../Kokkos_AdjacentDifference.hpp | 203 +- .../std_algorithms/Kokkos_AdjacentFind.hpp | 117 +- .../src/std_algorithms/Kokkos_AllOf.hpp | 64 +- .../src/std_algorithms/Kokkos_AnyOf.hpp | 64 +- algorithms/src/std_algorithms/Kokkos_Copy.hpp | 69 +- .../std_algorithms/Kokkos_CopyBackward.hpp | 69 +- .../src/std_algorithms/Kokkos_CopyIf.hpp | 77 +- .../src/std_algorithms/Kokkos_CopyN.hpp | 72 +- .../src/std_algorithms/Kokkos_Count.hpp | 60 +- .../src/std_algorithms/Kokkos_CountIf.hpp | 64 +- .../src/std_algorithms/Kokkos_Equal.hpp | 329 +- .../std_algorithms/Kokkos_ExclusiveScan.hpp | 243 +- algorithms/src/std_algorithms/Kokkos_Fill.hpp | 54 +- .../src/std_algorithms/Kokkos_FillN.hpp | 58 +- algorithms/src/std_algorithms/Kokkos_Find.hpp | 60 +- .../src/std_algorithms/Kokkos_FindEnd.hpp | 144 +- .../src/std_algorithms/Kokkos_FindFirstOf.hpp | 155 +- .../src/std_algorithms/Kokkos_FindIf.hpp | 70 +- .../src/std_algorithms/Kokkos_FindIfNot.hpp | 63 +- .../src/std_algorithms/Kokkos_ForEach.hpp | 85 +- .../src/std_algorithms/Kokkos_ForEachN.hpp | 72 +- .../src/std_algorithms/Kokkos_Generate.hpp | 54 +- .../src/std_algorithms/Kokkos_GenerateN.hpp | 63 +- .../std_algorithms/Kokkos_InclusiveScan.hpp | 311 +- .../std_algorithms/Kokkos_IsPartitioned.hpp | 61 +- .../src/std_algorithms/Kokkos_IsSorted.hpp | 116 +- .../std_algorithms/Kokkos_IsSortedUntil.hpp | 115 +- .../src/std_algorithms/Kokkos_IterSwap.hpp | 13 +- .../Kokkos_LexicographicalCompare.hpp | 144 +- .../src/std_algorithms/Kokkos_MaxElement.hpp | 105 +- .../src/std_algorithms/Kokkos_MinElement.hpp | 105 +- .../std_algorithms/Kokkos_MinMaxElement.hpp | 107 +- .../src/std_algorithms/Kokkos_Mismatch.hpp | 145 +- algorithms/src/std_algorithms/Kokkos_Move.hpp | 68 +- .../std_algorithms/Kokkos_MoveBackward.hpp | 71 +- .../src/std_algorithms/Kokkos_NoneOf.hpp | 65 +- .../std_algorithms/Kokkos_PartitionCopy.hpp | 90 +- .../std_algorithms/Kokkos_PartitionPoint.hpp | 62 +- .../src/std_algorithms/Kokkos_Reduce.hpp | 203 +- .../src/std_algorithms/Kokkos_Remove.hpp | 64 +- .../src/std_algorithms/Kokkos_RemoveCopy.hpp | 82 +- .../std_algorithms/Kokkos_RemoveCopyIf.hpp | 71 +- .../src/std_algorithms/Kokkos_RemoveIf.hpp | 66 +- .../src/std_algorithms/Kokkos_Replace.hpp | 63 +- .../src/std_algorithms/Kokkos_ReplaceCopy.hpp | 83 +- .../std_algorithms/Kokkos_ReplaceCopyIf.hpp | 88 +- .../src/std_algorithms/Kokkos_ReplaceIf.hpp | 69 +- .../src/std_algorithms/Kokkos_Reverse.hpp | 53 +- .../src/std_algorithms/Kokkos_ReverseCopy.hpp | 71 +- .../src/std_algorithms/Kokkos_Rotate.hpp | 57 +- .../src/std_algorithms/Kokkos_RotateCopy.hpp | 75 +- .../src/std_algorithms/Kokkos_Search.hpp | 143 +- .../src/std_algorithms/Kokkos_SearchN.hpp | 131 +- .../src/std_algorithms/Kokkos_ShiftLeft.hpp | 54 +- .../src/std_algorithms/Kokkos_ShiftRight.hpp | 54 +- .../src/std_algorithms/Kokkos_SwapRanges.hpp | 70 +- .../src/std_algorithms/Kokkos_Transform.hpp | 217 +- .../Kokkos_TransformExclusiveScan.hpp | 130 +- .../Kokkos_TransformInclusiveScan.hpp | 244 +- .../std_algorithms/Kokkos_TransformReduce.hpp | 276 +- .../src/std_algorithms/Kokkos_Unique.hpp | 125 +- .../src/std_algorithms/Kokkos_UniqueCopy.hpp | 163 +- .../impl/Kokkos_AdjacentDifference.hpp | 66 +- .../impl/Kokkos_AdjacentFind.hpp | 81 +- .../impl/Kokkos_AllOfAnyOfNoneOf.hpp | 55 +- .../impl/Kokkos_Constraints.hpp | 108 +- .../impl/Kokkos_CopyBackward.hpp | 49 +- .../std_algorithms/impl/Kokkos_CopyCopyN.hpp | 75 +- .../src/std_algorithms/impl/Kokkos_CopyIf.hpp | 74 +- .../impl/Kokkos_CountCountIf.hpp | 52 +- .../src/std_algorithms/impl/Kokkos_Equal.hpp | 116 +- .../impl/Kokkos_ExclusiveScan.hpp | 238 +- .../std_algorithms/impl/Kokkos_FillFillN.hpp | 49 +- .../std_algorithms/impl/Kokkos_FindEnd.hpp | 110 +- .../impl/Kokkos_FindFirstOf.hpp | 80 +- .../impl/Kokkos_FindIfOrNot.hpp | 72 +- .../impl/Kokkos_ForEachForEachN.hpp | 64 +- .../impl/Kokkos_FunctorsForExclusiveScan.hpp | 221 ++ .../impl/Kokkos_GenerateGenerateN.hpp | 51 +- .../impl/Kokkos_InclusiveScan.hpp | 144 +- .../impl/Kokkos_IsPartitioned.hpp | 71 +- .../std_algorithms/impl/Kokkos_IsSorted.hpp | 53 +- .../impl/Kokkos_IsSortedUntil.hpp | 74 +- .../impl/Kokkos_LexicographicalCompare.hpp | 92 +- .../impl/Kokkos_MinMaxMinmaxElement.hpp | 83 +- .../std_algorithms/impl/Kokkos_Mismatch.hpp | 95 +- .../src/std_algorithms/impl/Kokkos_Move.hpp | 33 +- .../impl/Kokkos_MoveBackward.hpp | 42 +- .../impl/Kokkos_MustUseKokkosSingleInTeam.hpp | 47 + .../impl/Kokkos_PartitionCopy.hpp | 76 +- .../impl/Kokkos_PartitionPoint.hpp | 45 +- .../impl/Kokkos_RandomAccessIterator.hpp | 31 + .../src/std_algorithms/impl/Kokkos_Reduce.hpp | 115 +- .../impl/Kokkos_RemoveAllVariants.hpp | 130 +- .../std_algorithms/impl/Kokkos_Replace.hpp | 28 +- .../impl/Kokkos_ReplaceCopy.hpp | 49 +- .../impl/Kokkos_ReplaceCopyIf.hpp | 65 +- .../std_algorithms/impl/Kokkos_ReplaceIf.hpp | 34 +- .../std_algorithms/impl/Kokkos_Reverse.hpp | 31 +- .../impl/Kokkos_ReverseCopy.hpp | 42 +- .../src/std_algorithms/impl/Kokkos_Rotate.hpp | 34 +- .../std_algorithms/impl/Kokkos_RotateCopy.hpp | 49 +- .../src/std_algorithms/impl/Kokkos_Search.hpp | 110 +- .../std_algorithms/impl/Kokkos_SearchN.hpp | 120 +- .../std_algorithms/impl/Kokkos_ShiftLeft.hpp | 41 +- .../std_algorithms/impl/Kokkos_ShiftRight.hpp | 42 +- .../std_algorithms/impl/Kokkos_SwapRanges.hpp | 42 +- .../std_algorithms/impl/Kokkos_Transform.hpp | 103 +- .../impl/Kokkos_TransformExclusiveScan.hpp | 122 +- .../impl/Kokkos_TransformInclusiveScan.hpp | 212 +- .../impl/Kokkos_TransformReduce.hpp | 116 +- .../src/std_algorithms/impl/Kokkos_Unique.hpp | 93 +- .../std_algorithms/impl/Kokkos_UniqueCopy.hpp | 133 +- algorithms/unit_tests/CMakeLists.txt | 344 +- algorithms/unit_tests/Makefile | 20 +- algorithms/unit_tests/TestBinSortA.hpp | 14 +- algorithms/unit_tests/TestBinSortB.hpp | 70 +- algorithms/unit_tests/TestRandom.hpp | 92 + .../unit_tests/TestRandomAccessIterator.cpp | 49 + algorithms/unit_tests/TestSortByKey.hpp | 253 ++ algorithms/unit_tests/TestSortCustomComp.hpp | 133 + .../unit_tests/TestStdAlgorithmsCommon.cpp | 14 + .../unit_tests/TestStdAlgorithmsCommon.hpp | 488 ++- .../TestStdAlgorithmsConstraints.cpp | 109 + .../TestStdAlgorithmsExclusiveScan.cpp | 234 +- .../TestStdAlgorithmsHelperFunctors.hpp | 20 +- .../TestStdAlgorithmsInclusiveScan.cpp | 226 +- .../unit_tests/TestStdAlgorithmsIsSorted.cpp | 7 +- .../TestStdAlgorithmsIsSortedUntil.cpp | 19 +- .../unit_tests/TestStdAlgorithmsModOps.cpp | 48 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- .../TestStdAlgorithmsPartitioningOps.cpp | 2 +- ...estStdAlgorithmsTeamAdjacentDifference.cpp | 220 ++ .../TestStdAlgorithmsTeamAdjacentFind.cpp | 256 ++ .../unit_tests/TestStdAlgorithmsTeamAllOf.cpp | 165 + .../unit_tests/TestStdAlgorithmsTeamAnyOf.cpp | 165 + .../unit_tests/TestStdAlgorithmsTeamCopy.cpp | 157 + .../TestStdAlgorithmsTeamCopyBackward.cpp | 168 + .../TestStdAlgorithmsTeamCopyIf.cpp | 180 + .../TestStdAlgorithmsTeamCopy_n.cpp | 176 + .../unit_tests/TestStdAlgorithmsTeamCount.cpp | 201 ++ .../TestStdAlgorithmsTeamCountIf.cpp | 162 + .../unit_tests/TestStdAlgorithmsTeamEqual.cpp | 278 ++ .../TestStdAlgorithmsTeamExclusiveScan.cpp | 272 ++ .../unit_tests/TestStdAlgorithmsTeamFill.cpp | 106 + .../TestStdAlgorithmsTeamFill_n.cpp | 176 + .../unit_tests/TestStdAlgorithmsTeamFind.cpp | 212 ++ .../TestStdAlgorithmsTeamFindEnd.cpp | 271 ++ .../TestStdAlgorithmsTeamFindFirstOf.cpp | 280 ++ .../TestStdAlgorithmsTeamFindIf.cpp | 241 ++ .../TestStdAlgorithmsTeamFindIfNot.cpp | 236 ++ .../TestStdAlgorithmsTeamForEach.cpp | 126 + .../TestStdAlgorithmsTeamForEachN.cpp | 144 + .../TestStdAlgorithmsTeamGenerate.cpp | 116 + .../TestStdAlgorithmsTeamGenerate_n.cpp | 179 + .../TestStdAlgorithmsTeamInclusiveScan.cpp | 297 ++ .../TestStdAlgorithmsTeamIsPartitioned.cpp | 255 ++ .../TestStdAlgorithmsTeamIsSorted.cpp | 209 ++ .../TestStdAlgorithmsTeamIsSortedUntil.cpp | 275 ++ ...tdAlgorithmsTeamLexicographicalCompare.cpp | 286 ++ .../TestStdAlgorithmsTeamMaxElement.cpp | 182 + .../TestStdAlgorithmsTeamMinElement.cpp | 181 + .../TestStdAlgorithmsTeamMinMaxElement.cpp | 200 ++ .../TestStdAlgorithmsTeamMismatch.cpp | 283 ++ .../unit_tests/TestStdAlgorithmsTeamMove.cpp | 161 + .../TestStdAlgorithmsTeamMoveBackward.cpp | 170 + .../TestStdAlgorithmsTeamNoneOf.cpp | 165 + .../TestStdAlgorithmsTeamPartitionCopy.cpp | 313 ++ .../TestStdAlgorithmsTeamPartitionPoint.cpp | 260 ++ .../TestStdAlgorithmsTeamReduce.cpp | 272 ++ .../TestStdAlgorithmsTeamRemove.cpp | 182 + .../TestStdAlgorithmsTeamRemoveCopy.cpp | 226 ++ .../TestStdAlgorithmsTeamRemoveCopyIf.cpp | 182 + .../TestStdAlgorithmsTeamRemoveIf.cpp | 166 + .../TestStdAlgorithmsTeamReplace.cpp | 135 + .../TestStdAlgorithmsTeamReplaceCopy.cpp | 204 ++ .../TestStdAlgorithmsTeamReplaceCopyIf.cpp | 183 + .../TestStdAlgorithmsTeamReplaceIf.cpp | 138 + .../TestStdAlgorithmsTeamReverse.cpp | 105 + .../TestStdAlgorithmsTeamReverseCopy.cpp | 153 + .../TestStdAlgorithmsTeamRotate.cpp | 173 + .../TestStdAlgorithmsTeamRotateCopy.cpp | 188 ++ .../TestStdAlgorithmsTeamSearch.cpp | 279 ++ .../TestStdAlgorithmsTeamSearchN.cpp | 295 ++ .../TestStdAlgorithmsTeamShiftLeft.cpp | 189 ++ .../TestStdAlgorithmsTeamShiftRight.cpp | 187 ++ .../TestStdAlgorithmsTeamSwapRanges.cpp | 151 + ...TestStdAlgorithmsTeamTransformBinaryOp.cpp | 185 ++ ...tdAlgorithmsTeamTransformExclusiveScan.cpp | 245 ++ ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 281 ++ .../TestStdAlgorithmsTeamTransformReduce.cpp | 323 ++ .../TestStdAlgorithmsTeamTransformUnaryOp.cpp | 176 + .../TestStdAlgorithmsTeamUnique.cpp | 171 + .../TestStdAlgorithmsTeamUniqueCopy.cpp | 200 ++ ...estStdAlgorithmsTransformExclusiveScan.cpp | 96 +- ...estStdAlgorithmsTransformInclusiveScan.cpp | 120 +- algorithms/unit_tests/TestStdReducers.cpp | 32 +- appveyor.yml | 2 +- benchmarks/CMakeLists.txt | 12 + benchmarks/atomic/CMakeLists.txt | 4 + benchmarks/bytes_and_flops/CMakeLists.txt | 4 + benchmarks/bytes_and_flops/bench.hpp | 12 +- benchmarks/bytes_and_flops/bench_double.cpp | 2 +- benchmarks/bytes_and_flops/bench_float.cpp | 2 +- benchmarks/bytes_and_flops/bench_int32_t.cpp | 2 +- benchmarks/bytes_and_flops/bench_int64_t.cpp | 2 +- benchmarks/bytes_and_flops/bench_stride.hpp | 16 +- .../bytes_and_flops/bench_unroll_stride.hpp | 2 +- benchmarks/bytes_and_flops/main.cpp | 4 +- benchmarks/gather/CMakeLists.txt | 4 + benchmarks/gather/gather.hpp | 16 +- benchmarks/gather/gather_unroll.hpp | 2 +- benchmarks/gather/main.cpp | 2 +- benchmarks/gups/CMakeLists.txt | 4 + benchmarks/gups/gups-kokkos.cpp | 175 - benchmarks/gups/gups.cpp | 195 ++ benchmarks/launch_latency/CMakeLists.txt | 4 + benchmarks/launch_latency/launch_latency.cpp | 283 ++ benchmarks/policy_performance/CMakeLists.txt | 4 + benchmarks/policy_performance/main.cpp | 5 +- .../policy_performance/policy_perf_test.hpp | 14 +- benchmarks/stream/CMakeLists.txt | 4 + .../view_copy_constructor/CMakeLists.txt | 4 + .../{gups => view_copy_constructor}/Makefile | 21 +- .../view_copy_constructor.cpp | 310 ++ bin/kokkos_launch_compiler | 4 +- bin/nvcc_wrapper | 6 +- cmake/Dependencies.cmake | 1 - cmake/KokkosConfig.cmake.in | 8 +- cmake/KokkosConfigCommon.cmake.in | 8 +- cmake/KokkosCore_config.h.in | 39 +- cmake/Modules/FindTPLCUDA.cmake | 54 +- cmake/Modules/FindTPLLIBRT.cmake | 1 - cmake/Modules/FindTPLMEMKIND.cmake | 1 - cmake/Modules/FindTPLONEDPL.cmake | 3 + cmake/Modules/FindTPLROCM.cmake | 26 +- cmake/Modules/FindTPLROCTHRUST.cmake | 15 + cmake/deps/CUDA.cmake | 1 - cmake/deps/CUSPARSE.cmake | 26 - cmake/fake_tribits.cmake | 17 - cmake/kokkos_arch.cmake | 397 ++- cmake/kokkos_compiler_id.cmake | 83 +- cmake/kokkos_enable_devices.cmake | 47 +- cmake/kokkos_enable_options.cmake | 18 +- cmake/kokkos_functions.cmake | 7 +- cmake/kokkos_pick_cxx_std.cmake | 1 + cmake/kokkos_test_cxx_std.cmake | 8 +- cmake/kokkos_tpls.cmake | 42 +- cmake/kokkos_tribits.cmake | 71 +- cmake/tpls/FindTPLCUSPARSE.cmake | 26 - config/test_all_sandia | 773 ----- config/yaml/volta.yaml | 4 - containers/performance_tests/TestCuda.cpp | 4 - containers/src/Kokkos_Bitset.hpp | 42 +- containers/src/Kokkos_DualView.hpp | 58 +- containers/src/Kokkos_DynRankView.hpp | 423 +-- containers/src/Kokkos_DynamicView.hpp | 333 +- containers/src/Kokkos_OffsetView.hpp | 274 +- containers/src/Kokkos_UnorderedMap.hpp | 218 +- containers/src/Kokkos_Vector.hpp | 21 +- containers/src/impl/Kokkos_Bitset_impl.hpp | 1 - .../src/impl/Kokkos_UnorderedMap_impl.hpp | 17 +- containers/unit_tests/CMakeLists.txt | 8 + containers/unit_tests/Makefile | 11 +- containers/unit_tests/TestBitset.hpp | 36 +- containers/unit_tests/TestDualView.hpp | 190 +- containers/unit_tests/TestUnorderedMap.hpp | 104 +- containers/unit_tests/TestVector.hpp | 4 +- core/perf_test/CMakeLists.txt | 11 +- core/perf_test/PerfTest_MallocFree.cpp | 100 + core/perf_test/PerfTest_ViewAllocate.cpp | 34 - core/perf_test/test_atomic.cpp | 3 +- core/perf_test/test_atomic_minmax_simple.cpp | 8 +- core/perf_test/test_mempool.cpp | 1 + core/perf_test/test_taskdag.cpp | 2 + core/src/CMakeLists.txt | 17 +- core/src/Cuda/Kokkos_Cuda.hpp | 68 +- core/src/Cuda/Kokkos_CudaSpace.cpp | 372 +-- core/src/Cuda/Kokkos_CudaSpace.hpp | 251 +- .../Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp | 8 +- core/src/Cuda/Kokkos_Cuda_Error.hpp | 51 - core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp | 28 +- core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp | 1 - core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 52 +- core/src/Cuda/Kokkos_Cuda_Instance.cpp | 729 ++-- core/src/Cuda/Kokkos_Cuda_Instance.hpp | 344 +- core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 123 +- core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp | 4 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 23 +- core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 45 +- core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 133 +- core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp | 11 +- core/src/Cuda/Kokkos_Cuda_Task.hpp | 45 +- core/src/Cuda/Kokkos_Cuda_Team.hpp | 67 +- core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp | 1 - core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp | 4 +- core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp | 17 +- core/src/Cuda/Kokkos_Cuda_abort.hpp | 25 +- core/src/HIP/Kokkos_HIP.cpp | 48 +- core/src/HIP/Kokkos_HIP.hpp | 32 +- core/src/HIP/Kokkos_HIP_Error.hpp | 37 - core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp | 152 + core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp | 54 + core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 187 ++ core/src/HIP/Kokkos_HIP_Instance.cpp | 213 +- core/src/HIP/Kokkos_HIP_Instance.hpp | 74 +- core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 157 +- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 173 + core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 100 + core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 177 + ... => Kokkos_HIP_ParallelReduce_MDRange.hpp} | 145 +- .../HIP/Kokkos_HIP_ParallelReduce_Range.hpp | 329 ++ .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 395 +++ ....hpp => Kokkos_HIP_ParallelScan_Range.hpp} | 344 +- core/src/HIP/Kokkos_HIP_Parallel_Team.hpp | 902 ----- core/src/HIP/Kokkos_HIP_ReduceScan.hpp | 48 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.cpp | 144 +- .../HIP/Kokkos_HIP_SharedAllocationRecord.hpp | 121 +- core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp | 9 +- core/src/HIP/Kokkos_HIP_Space.cpp | 61 +- core/src/HIP/Kokkos_HIP_Space.hpp | 62 +- core/src/HIP/Kokkos_HIP_Team.hpp | 69 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 421 +++ core/src/HIP/Kokkos_HIP_UniqueToken.hpp | 1 - core/src/HIP/Kokkos_HIP_Vectorization.hpp | 2 +- core/src/HIP/Kokkos_HIP_ZeroMemset.hpp | 10 +- core/src/HPX/Kokkos_HPX.cpp | 33 +- core/src/HPX/Kokkos_HPX.hpp | 124 +- core/src/HPX/Kokkos_HPX_Task.hpp | 9 +- core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp | 3 +- core/src/KokkosExp_MDRangePolicy.hpp | 112 +- core/src/Kokkos_Abort.hpp | 105 + core/src/Kokkos_Array.hpp | 126 +- core/src/Kokkos_Assert.hpp | 66 + .../Kokkos_Atomics_Desul_Volatile_Wrapper.hpp | 3 +- core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 4 +- core/src/Kokkos_BitManipulation.hpp | 37 +- core/src/Kokkos_Clamp.hpp | 41 + core/src/Kokkos_Complex.hpp | 81 + core/src/Kokkos_Concepts.hpp | 4 +- core/src/Kokkos_CopyViews.hpp | 619 ++-- core/src/Kokkos_Core.hpp | 9 +- core/src/Kokkos_Core_fwd.hpp | 14 +- core/src/Kokkos_ExecPolicy.hpp | 294 +- core/src/Kokkos_Extents.hpp | 68 +- core/src/Kokkos_Graph.hpp | 9 + core/src/Kokkos_HBWSpace.hpp | 308 -- core/src/Kokkos_Half.hpp | 1 + core/src/Kokkos_HostSpace.hpp | 118 +- core/src/Kokkos_Layout.hpp | 120 +- core/src/Kokkos_LogicalSpaces.hpp | 413 --- core/src/Kokkos_Macros.hpp | 86 +- core/src/Kokkos_MasterLock.hpp | 56 - core/src/Kokkos_MathematicalConstants.hpp | 18 - core/src/Kokkos_MathematicalFunctions.hpp | 63 +- .../Kokkos_MathematicalSpecialFunctions.hpp | 81 +- core/src/Kokkos_MemoryPool.hpp | 2 - ...kkos_MinMaxClamp.hpp => Kokkos_MinMax.hpp} | 34 +- core/src/Kokkos_Pair.hpp | 30 +- core/src/Kokkos_Parallel.hpp | 33 +- core/src/Kokkos_Parallel_Reduce.hpp | 78 +- core/src/Kokkos_Printf.hpp | 55 + core/src/Kokkos_Profiling_ProfileSection.hpp | 41 +- core/src/Kokkos_ScratchSpace.hpp | 2 +- core/src/Kokkos_Swap.hpp | 68 + core/src/Kokkos_Tuners.hpp | 11 +- core/src/Kokkos_View.hpp | 382 ++- core/src/OpenACC/Kokkos_OpenACC.cpp | 4 +- core/src/OpenACC/Kokkos_OpenACC.hpp | 9 +- core/src/OpenACC/Kokkos_OpenACCSpace.cpp | 6 +- core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp | 26 +- .../OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp | 2 +- core/src/OpenACC/Kokkos_OpenACC_Instance.hpp | 4 +- .../Kokkos_OpenACC_ParallelFor_Team.hpp | 11 +- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 12 +- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 4 +- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 7 +- .../Kokkos_OpenACC_ParallelScan_Range.hpp | 98 +- .../Kokkos_OpenACC_SharedAllocationRecord.cpp | 89 +- .../Kokkos_OpenACC_SharedAllocationRecord.hpp | 52 +- core/src/OpenACC/Kokkos_OpenACC_Team.hpp | 6 +- core/src/OpenACC/Kokkos_OpenACC_Traits.hpp | 3 + core/src/OpenMP/Kokkos_OpenMP.cpp | 44 +- core/src/OpenMP/Kokkos_OpenMP.hpp | 63 +- core/src/OpenMP/Kokkos_OpenMP_Instance.cpp | 95 +- core/src/OpenMP/Kokkos_OpenMP_Instance.hpp | 111 +- core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp | 1267 ------- .../src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp | 412 +++ .../OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp | 537 +++ .../OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp | 296 ++ core/src/OpenMP/Kokkos_OpenMP_Task.cpp | 13 +- core/src/OpenMP/Kokkos_OpenMP_Task.hpp | 16 +- core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 9 +- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 77 +- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 73 +- .../OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 47 +- .../Kokkos_OpenMPTarget_Instance.cpp | 15 +- .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 5 + .../Kokkos_OpenMPTarget_Macros.hpp | 46 + .../Kokkos_OpenMPTarget_Parallel.hpp | 62 +- ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 383 +++ .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 7 +- ...s_OpenMPTarget_ParallelReduce_MDRange.hpp} | 626 ++-- ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 3 + ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 3 + ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 17 +- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 61 +- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 151 +- .../Kokkos_OpenMPTarget_Reducer.hpp | 1 - core/src/SYCL/Kokkos_SYCL.cpp | 124 +- core/src/SYCL/Kokkos_SYCL.hpp | 20 +- core/src/SYCL/Kokkos_SYCL_Abort.hpp | 4 +- core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp | 157 + core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp | 56 + core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 174 + core/src/SYCL/Kokkos_SYCL_Half_Conversion.hpp | 128 +- core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp | 37 +- core/src/SYCL/Kokkos_SYCL_Instance.cpp | 235 +- core/src/SYCL/Kokkos_SYCL_Instance.hpp | 36 +- ...pp => Kokkos_SYCL_ParallelFor_MDRange.hpp} | 161 +- .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 179 + .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 200 ++ .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 378 +++ .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 396 +++ .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 484 +++ ...hpp => Kokkos_SYCL_ParallelScan_Range.hpp} | 279 +- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 788 ----- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 914 ----- core/src/SYCL/Kokkos_SYCL_Space.cpp | 313 +- core/src/SYCL/Kokkos_SYCL_Space.hpp | 201 +- core/src/SYCL/Kokkos_SYCL_Team.hpp | 191 +- core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp | 357 ++ .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 256 ++ core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp | 11 +- core/src/Serial/Kokkos_Serial.cpp | 49 +- core/src/Serial/Kokkos_Serial.hpp | 71 +- .../Serial/Kokkos_Serial_Parallel_MDRange.hpp | 21 +- .../Serial/Kokkos_Serial_Parallel_Range.hpp | 32 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 32 +- core/src/Serial/Kokkos_Serial_Task.hpp | 4 +- core/src/Serial/Kokkos_Serial_ZeroMemset.hpp | 14 +- core/src/Threads/Kokkos_Threads.hpp | 29 +- ...dsExec.cpp => Kokkos_Threads_Instance.cpp} | 454 +-- ...dsExec.hpp => Kokkos_Threads_Instance.hpp} | 180 +- .../Kokkos_Threads_ParallelFor_MDRange.hpp | 115 + .../Kokkos_Threads_ParallelFor_Range.hpp | 122 + .../Kokkos_Threads_ParallelFor_Team.hpp | 118 + ...Kokkos_Threads_ParallelReduce_MDRange.hpp} | 131 +- .../Kokkos_Threads_ParallelReduce_Range.hpp | 173 + ...=> Kokkos_Threads_ParallelReduce_Team.hpp} | 110 +- .../Kokkos_Threads_ParallelScan_Range.hpp | 198 ++ .../Threads/Kokkos_Threads_Parallel_Range.hpp | 435 --- .../Kokkos_Threads_Spinwait.cpp} | 12 +- core/src/Threads/Kokkos_Threads_Spinwait.hpp | 43 + .../Threads/Kokkos_Threads_State.hpp} | 25 +- ...hreadsTeam.hpp => Kokkos_Threads_Team.hpp} | 118 +- .../Kokkos_Threads_WorkGraphPolicy.hpp | 11 +- core/src/View/Kokkos_ViewAlloc.hpp | 318 ++ .../View/MDSpan/Kokkos_MDSpan_Accessor.hpp | 220 ++ .../src/View/MDSpan/Kokkos_MDSpan_Extents.hpp | 29 +- core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp | 24 +- core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp | 156 + core/src/decl/Kokkos_Declare_HIP.hpp | 10 +- core/src/decl/Kokkos_Declare_OPENMP.hpp | 3 + core/src/decl/Kokkos_Declare_SYCL.hpp | 14 +- core/src/decl/Kokkos_Declare_THREADS.hpp | 10 + .../src/impl/Kokkos_Abort.cpp | 37 +- core/src/impl/Kokkos_CheckedIntegerOps.hpp | 66 + core/src/impl/Kokkos_Core.cpp | 313 +- core/src/impl/Kokkos_Default_Graph_Impl.hpp | 9 +- .../Kokkos_DesulAtomicsConfig.hpp} | 12 +- core/src/impl/Kokkos_DeviceManagement.hpp | 8 +- core/src/impl/Kokkos_Error.cpp | 139 +- core/src/impl/Kokkos_Error.hpp | 246 +- core/src/impl/Kokkos_FunctorAnalysis.hpp | 4 - core/src/impl/Kokkos_HBWSpace.cpp | 313 -- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 247 +- .../Kokkos_Half_MathematicalFunctions.hpp | 259 ++ core/src/impl/Kokkos_Half_NumericTraits.hpp | 40 +- core/src/impl/Kokkos_HostSpace.cpp | 219 +- core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp | 11 +- core/src/impl/Kokkos_HostSpace_deepcopy.cpp | 2 +- core/src/impl/Kokkos_HostThreadTeam.cpp | 1 - core/src/impl/Kokkos_HostThreadTeam.hpp | 96 +- .../impl/Kokkos_InitializationSettings.hpp | 85 +- core/src/impl/Kokkos_MemorySpace.cpp | 72 - core/src/impl/Kokkos_MemorySpace.hpp | 71 - core/src/impl/Kokkos_Memory_Fence.hpp | 54 - core/src/impl/Kokkos_Profiling.cpp | 94 +- core/src/impl/Kokkos_Profiling.hpp | 59 +- core/src/impl/Kokkos_Profiling_C_Interface.h | 12 +- core/src/impl/Kokkos_Profiling_Interface.hpp | 2 +- core/src/impl/Kokkos_SharedAlloc.cpp | 65 +- core/src/impl/Kokkos_SharedAlloc.hpp | 212 +- core/src/impl/Kokkos_SharedAlloc_timpl.hpp | 141 +- core/src/impl/Kokkos_Spinwait.hpp | 109 - core/src/impl/Kokkos_TaskQueueCommon.hpp | 8 +- core/src/impl/Kokkos_TaskQueueMultiple.hpp | 4 +- core/src/impl/Kokkos_Utilities.hpp | 32 + core/src/impl/Kokkos_ViewArray.hpp | 617 ---- core/src/impl/Kokkos_ViewDataAnalysis.hpp | 402 +++ core/src/impl/Kokkos_ViewLayoutTiled.hpp | 1425 -------- core/src/impl/Kokkos_ViewMapping.hpp | 1136 ++----- core/src/setup/Kokkos_Setup_Cuda.hpp | 2 + core/src/setup/Kokkos_Setup_HIP.hpp | 2 + core/src/setup/Kokkos_Setup_SYCL.hpp | 33 +- core/src/traits/Kokkos_IndexTypeTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 2 +- core/src/traits/Kokkos_PolicyTraitAdaptor.hpp | 4 +- core/src/traits/Kokkos_ScheduleTrait.hpp | 2 +- .../traits/Kokkos_WorkItemPropertyTrait.hpp | 2 +- core/unit_test/CMakeLists.txt | 312 +- core/unit_test/Makefile | 42 +- core/unit_test/TestAbort.hpp | 10 +- core/unit_test/TestAggregate.hpp | 112 - core/unit_test/TestArray.cpp | 193 ++ core/unit_test/TestArrayOps.hpp | 422 +++ core/unit_test/TestAtomicOperations.hpp | 1493 +++------ .../TestAtomicOperations_complexdouble.hpp | 26 +- .../TestAtomicOperations_complexfloat.hpp | 26 +- .../unit_test/TestAtomicOperations_double.hpp | 19 +- core/unit_test/TestAtomicOperations_float.hpp | 19 +- core/unit_test/TestAtomicOperations_int.hpp | 29 +- .../TestAtomicOperations_longint.hpp | 29 +- .../TestAtomicOperations_longlongint.hpp | 29 +- .../TestAtomicOperations_unsignedint.hpp | 33 +- .../TestAtomicOperations_unsignedlongint.hpp | 33 +- ...stAtomicOperations_unsignedlonglongint.hpp | 36 + core/unit_test/TestAtomics.hpp | 18 +- core/unit_test/TestBitManipulation.cpp | 4 +- .../unit_test/TestBitManipulationBuiltins.hpp | 37 +- core/unit_test/TestCheckedIntegerOps.hpp | 51 + core/unit_test/TestComplex.hpp | 186 +- core/unit_test/TestConcepts.hpp | 68 +- core/unit_test/TestDefaultDeviceTypeInit.hpp | 491 --- core/unit_test/TestDeviceAndThreads.py | 39 +- core/unit_test/TestExecSpacePartitioning.hpp | 73 +- core/unit_test/TestExecSpaceThreadSafety.hpp | 327 ++ core/unit_test/TestExecutionSpace.hpp | 64 +- core/unit_test/TestFunctorAnalysis.hpp | 58 +- core/unit_test/TestGraph.hpp | 71 +- core/unit_test/TestHalfOperators.hpp | 202 +- .../TestHostSharedPtrAccessOnDevice.hpp | 8 +- core/unit_test/TestInitializationSettings.cpp | 28 - .../TestJoinBackwardCompatibility.hpp | 34 +- core/unit_test/TestLocalDeepCopy.hpp | 1 + core/unit_test/TestMDRangePolicyCTAD.cpp | 138 + .../TestMDRangePolicyConstructors.hpp | 56 +- core/unit_test/TestMDSpan.hpp | 12 +- core/unit_test/TestMDSpanAtomicAccessor.hpp | 112 + core/unit_test/TestMDSpanConversion.hpp | 507 +++ core/unit_test/TestMathematicalConstants.hpp | 2 +- core/unit_test/TestMathematicalFunctions.hpp | 822 ++++- .../TestMathematicalSpecialFunctions.hpp | 116 +- core/unit_test/TestMinMaxClamp.hpp | 5 - core/unit_test/TestMultiGPU.hpp | 184 + core/unit_test/TestNestedReducerCTAD.cpp | 246 ++ core/unit_test/TestNonTrivialScalarTypes.hpp | 2 +- core/unit_test/TestNumericTraits.hpp | 251 +- core/unit_test/TestOccupancyControlTrait.hpp | 80 + core/unit_test/TestOther.hpp | 10 - .../TestParseCmdLineArgsAndEnvVars.cpp | 66 +- core/unit_test/TestPrintf.hpp | 37 + core/unit_test/TestQuadPrecisionMath.hpp | 2 +- core/unit_test/TestRange.hpp | 10 +- core/unit_test/TestRangePolicyCTAD.cpp | 150 + .../unit_test/TestRangePolicyConstructors.hpp | 166 + core/unit_test/TestRangePolicyRequire.hpp | 13 +- core/unit_test/TestRealloc.hpp | 13 + core/unit_test/TestReduce.hpp | 26 + core/unit_test/TestReducers.hpp | 137 +- core/unit_test/TestReducers_b.hpp | 4 + core/unit_test/TestReducers_d.hpp | 14 + core/unit_test/TestResize.hpp | 13 + core/unit_test/TestSharedSpace.cpp | 8 +- core/unit_test/TestSpaceAwareAccessor.hpp | 156 + .../TestSpaceAwareAccessorAccessViolation.hpp | 128 + core/unit_test/TestSwap.hpp | 68 + core/unit_test/TestTaskScheduler.hpp | 2 +- core/unit_test/TestTeam.hpp | 25 +- core/unit_test/TestTeamBasic.hpp | 2 +- core/unit_test/TestTeamMDRange.hpp | 526 ++- core/unit_test/TestTeamMDRangePolicyCTAD.cpp | 199 ++ core/unit_test/TestTeamPolicyCTAD.cpp | 135 + core/unit_test/TestTeamPolicyConstructors.hpp | 35 +- core/unit_test/TestTeamScan.hpp | 129 + core/unit_test/TestTeamScratch.hpp | 18 + core/unit_test/TestTeamVector.hpp | 195 +- core/unit_test/TestTeamVectorRange.hpp | 26 +- core/unit_test/TestUtilities.hpp | 108 +- core/unit_test/TestViewAPI.hpp | 114 +- core/unit_test/TestViewAPI_c.hpp | 1 + core/unit_test/TestViewAPI_d.hpp | 7 - core/unit_test/TestViewBadAlloc.hpp | 86 + core/unit_test/TestViewCopy_a.hpp | 34 + core/unit_test/TestViewCopy_c.hpp | 434 +++ core/unit_test/TestViewCtorDimMatch.hpp | 505 +-- .../TestViewEmptyRuntimeUnmanaged.hpp | 55 + .../TestViewLayoutStrideAssignment.hpp | 2 - core/unit_test/TestViewLayoutTiled.hpp | 1756 ---------- core/unit_test/TestViewMapping_a.hpp | 277 +- core/unit_test/TestViewMapping_b.hpp | 14 +- .../TestViewMemoryAccessViolation.hpp | 8 +- core/unit_test/TestViewOfViews.hpp | 75 + core/unit_test/TestViewOutOfBoundsAccess.hpp | 175 + core/unit_test/TestViewSubview.hpp | 5 +- core/unit_test/UnitTest_DeviceAndThreads.cpp | 46 +- core/unit_test/UnitTest_ScopeGuard.cpp | 155 + .../category_files/TestHPX_Category.hpp | 1 + .../category_files/TestOpenACC_Category.hpp | 1 + .../TestOpenMPTarget_Category.hpp | 1 + .../category_files/TestSYCL_Category.hpp | 1 + .../category_files/TestThreads_Category.hpp | 1 + .../test-code/test_config_arch_list.bash | 2 +- .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 108 + core/unit_test/cuda/TestCuda_Spaces.cpp | 176 +- .../default/TestDefaultDeviceType.cpp | 9 +- .../default/TestDefaultDeviceTypeInit_1.cpp | 18 - .../default/TestDefaultDeviceTypeInit_10.cpp | 18 - .../default/TestDefaultDeviceTypeInit_11.cpp | 18 - .../default/TestDefaultDeviceTypeInit_12.cpp | 18 - .../default/TestDefaultDeviceTypeInit_13.cpp | 18 - .../default/TestDefaultDeviceTypeInit_14.cpp | 18 - .../default/TestDefaultDeviceTypeInit_15.cpp | 18 - .../default/TestDefaultDeviceTypeInit_16.cpp | 18 - .../default/TestDefaultDeviceTypeInit_17.cpp | 18 - .../default/TestDefaultDeviceTypeInit_18.cpp | 18 - .../default/TestDefaultDeviceTypeInit_2.cpp | 18 - .../default/TestDefaultDeviceTypeInit_3.cpp | 18 - .../default/TestDefaultDeviceTypeInit_4.cpp | 18 - .../default/TestDefaultDeviceTypeInit_5.cpp | 18 - .../default/TestDefaultDeviceTypeInit_6.cpp | 18 - .../default/TestDefaultDeviceTypeInit_7.cpp | 18 - .../default/TestDefaultDeviceTypeInit_8.cpp | 18 - .../default/TestDefaultDeviceTypeInit_9.cpp | 18 - .../headers_self_contained/CMakeLists.txt | 5 + core/unit_test/hip/TestHIP_Graph.cpp | 18 - core/unit_test/hip/TestHIP_Spaces.cpp | 166 +- .../hpx/TestHPX_IndependentInstances.cpp | 2 +- ...X_IndependentInstancesDelayedExecution.cpp | 2 +- ...estHPX_IndependentInstancesInstanceIds.cpp | 2 +- .../incremental/Test01_execspace.hpp | 4 + core/unit_test/openmp/TestOpenMP_Graph.cpp | 18 - .../openmp/TestOpenMP_PartitionMaster.cpp | 105 - core/unit_test/serial/TestSerial_Graph.cpp | 18 - core/unit_test/sycl/TestSYCL_InterOp_Init.cpp | 3 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 6 +- .../sycl/TestSYCL_InterOp_Streams.cpp | 3 +- .../sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp | 64 + core/unit_test/sycl/TestSYCL_Spaces.cpp | 195 +- .../sycl/TestSYCL_TeamScratchStreams.cpp | 6 +- core/unit_test/tools/TestEventCorrectness.hpp | 24 +- core/unit_test/tools/TestLogicalSpaces.hpp | 177 - core/unit_test/tools/TestProfilingSection.cpp | 10 +- .../view/TestExtentsDatatypeConversion.cpp | 64 +- example/README | 4 +- example/build_cmake_in_tree/cmake_example.cpp | 2 + example/build_cmake_installed/CMakeLists.txt | 1 + .../build_cmake_installed/cmake_example.cpp | 2 + .../foo.cpp | 2 +- .../cmake_example.cpp | 2 + .../tutorial/01_hello_world/hello_world.cpp | 26 +- .../hello_world_lambda.cpp | 24 +- .../simple_reduce_lambda.cpp | 5 +- .../01_thread_teams/thread_teams.cpp | 10 +- .../thread_teams_lambda.cpp | 16 +- .../nested_parallel_for.cpp | 15 +- .../03_vectorization/vectorization.cpp | 7 +- generate_makefile.bash | 33 +- gnu_generate_makefile.bash | 12 - master_history.txt | 5 + scripts/check-copyright | 21 +- scripts/docker/Dockerfile.clang | 42 +- scripts/docker/Dockerfile.kokkosllvmproject | 10 +- scripts/docker/Dockerfile.nvhpc | 2 +- scripts/docker/Dockerfile.openmptarget | 9 +- scripts/docker/Dockerfile.sycl | 28 +- .../testing_scripts/generate_makefile.bash | 12 - scripts/testing_scripts/test_all_sandia | 30 +- simd/src/Kokkos_SIMD.hpp | 70 +- simd/src/Kokkos_SIMD_AVX2.hpp | 2212 +++++++++++-- simd/src/Kokkos_SIMD_AVX512.hpp | 2947 ++++++++++++++--- simd/src/Kokkos_SIMD_Common.hpp | 192 +- simd/src/Kokkos_SIMD_Common_Math.hpp | 260 ++ simd/src/Kokkos_SIMD_NEON.hpp | 2112 ++++++++++-- simd/src/Kokkos_SIMD_Scalar.hpp | 232 +- simd/unit_tests/CMakeLists.txt | 14 +- simd/unit_tests/TestSIMD.cpp | 549 +-- simd/unit_tests/include/SIMDTesting_Ops.hpp | 375 +++ .../include/SIMDTesting_Utilities.hpp | 194 ++ .../unit_tests/include/TestSIMD_Condition.hpp | 109 + .../include/TestSIMD_Conversions.hpp | 135 + .../include/TestSIMD_GeneratorCtors.hpp | 146 + simd/unit_tests/include/TestSIMD_MaskOps.hpp | 120 + simd/unit_tests/include/TestSIMD_MathOps.hpp | 323 ++ .../include/TestSIMD_Reductions.hpp | 192 ++ simd/unit_tests/include/TestSIMD_ShiftOps.hpp | 290 ++ .../include/TestSIMD_WhereExpressions.hpp | 205 ++ tpls/desul/Config.hpp.cmake.in | 2 + .../desul/include/desul/atomics/Adapt_HIP.hpp | 77 + .../include/desul/atomics/Adapt_SYCL.hpp | 7 +- .../include/desul/atomics/Atomic_Ref.hpp | 554 +--- .../desul/atomics/Compare_Exchange.hpp | 3 + .../desul/atomics/Compare_Exchange_HIP.hpp | 145 +- .../atomics/Compare_Exchange_OpenACC.hpp | 153 + tpls/desul/include/desul/atomics/Fetch_Op.hpp | 3 + .../include/desul/atomics/Fetch_Op_CUDA.hpp | 54 +- .../desul/atomics/Fetch_Op_Generic.hpp | 92 +- .../include/desul/atomics/Fetch_Op_HIP.hpp | 167 +- .../desul/atomics/Fetch_Op_OpenACC.hpp | 431 +++ tpls/desul/include/desul/atomics/Generic.hpp | 48 + .../include/desul/atomics/Lock_Array_SYCL.hpp | 80 +- .../desul/atomics/Lock_Based_Fetch_Op.hpp | 3 + .../atomics/Lock_Based_Fetch_Op_OpenACC.hpp | 81 + tpls/desul/include/desul/atomics/Macros.hpp | 28 + .../atomics/Operator_Function_Objects.hpp | 34 +- .../include/desul/atomics/Thread_Fence.hpp | 3 + .../desul/atomics/Thread_Fence_OpenACC.hpp | 25 + ...da_cc7_asm_atomic_fetch_op.inc_forceglobal | 153 - .../cuda_cc7_asm_atomic_fetch_op.inc_generic | 151 - .../cuda_cc7_asm_atomic_fetch_op.inc_isglobal | 57 +- ...cuda_cc7_asm_atomic_fetch_op.inc_predicate | 54 +- .../cuda_cc7_asm_atomic_op.inc_forceglobal | 64 - .../cuda/cuda_cc7_asm_atomic_op.inc_generic | 64 - .../cuda/cuda_cc7_asm_atomic_op.inc_isglobal | 55 +- .../cuda/cuda_cc7_asm_atomic_op.inc_predicate | 55 +- tpls/desul/src/Lock_Array_SYCL.cpp | 21 +- tpls/gtest/gtest/gtest-all.cc | 10 +- tpls/gtest/gtest/gtest.h | 18 +- .../__p0009_bits/compressed_pair.hpp | 214 +- .../experimental/__p0009_bits/config.hpp | 77 +- .../__p0009_bits/default_accessor.hpp | 50 +- .../__p0009_bits/dynamic_extent.hpp | 72 +- .../experimental/__p0009_bits/extents.hpp | 978 +++--- .../__p0009_bits/full_extent_t.hpp | 48 +- .../experimental/__p0009_bits/layout_left.hpp | 164 +- .../__p0009_bits/layout_right.hpp | 166 +- .../__p0009_bits/layout_stride.hpp | 428 ++- .../experimental/__p0009_bits/macros.hpp | 146 +- .../__p0009_bits/maybe_static_value.hpp | 152 - .../experimental/__p0009_bits/mdspan.hpp | 226 +- .../__p0009_bits/no_unique_address.hpp | 54 +- .../standard_layout_static_array.hpp | 685 ---- .../__p0009_bits/static_array.hpp | 286 -- .../experimental/__p0009_bits/submdspan.hpp | 586 ---- .../__p0009_bits/trait_backports.hpp | 62 +- .../experimental/__p0009_bits/type_list.hpp | 52 +- .../experimental/__p0009_bits/utility.hpp | 72 + .../experimental/__p1684_bits/mdarray.hpp | 263 +- .../experimental/__p2389_bits/dims.hpp | 16 +- .../__p2630_bits/strided_slice.hpp | 48 + .../experimental/__p2630_bits/submdspan.hpp | 40 + .../__p2630_bits/submdspan_extents.hpp | 321 ++ .../__p2630_bits/submdspan_mapping.hpp | 625 ++++ .../__p2642_bits/layout_padded.hpp | 867 +++++ .../__p2642_bits/layout_padded_fwd.hpp | 131 + tpls/mdspan/include/experimental/mdarray | 48 - tpls/mdspan/include/experimental/mdspan | 56 - .../mdspan/include/mdspan/mdarray.hpp | 22 +- tpls/mdspan/include/mdspan/mdspan.hpp | 43 + 800 files changed, 66747 insertions(+), 35204 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/clang-format-check.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/releases.yml create mode 100644 .github/workflows/scorecard.yml create mode 100644 .github/workflows/windows.yml create mode 100644 .olcf-gitlab-ci.yml create mode 100644 CITATION.cff create mode 100644 SECURITY.md create mode 100644 algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp create mode 100644 algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp create mode 100644 algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp create mode 100644 algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp create mode 100644 algorithms/src/sorting/Kokkos_SortPublicAPI.hpp create mode 100644 algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp create mode 100644 algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp create mode 100644 algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp create mode 100644 algorithms/src/sorting/impl/Kokkos_SortImpl.hpp create mode 100644 algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp create mode 100644 algorithms/src/std_algorithms/impl/Kokkos_MustUseKokkosSingleInTeam.hpp create mode 100644 algorithms/unit_tests/TestSortByKey.hpp create mode 100644 algorithms/unit_tests/TestSortCustomComp.hpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentFind.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamAllOf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamAnyOf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCopyBackward.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamCountIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamEqual.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFill.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFill_n.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFindFirstOf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamForEach.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamForEachN.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamGenerate.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamLexicographicalCompare.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMismatch.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamMoveBackward.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamNoneOf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReplace.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReverse.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamReverseCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRotate.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamSearch.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamSearchN.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamShiftLeft.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformBinaryOp.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamTransformUnaryOp.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp create mode 100644 algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp create mode 100644 benchmarks/CMakeLists.txt create mode 100644 benchmarks/atomic/CMakeLists.txt create mode 100644 benchmarks/bytes_and_flops/CMakeLists.txt create mode 100644 benchmarks/gather/CMakeLists.txt create mode 100644 benchmarks/gups/CMakeLists.txt delete mode 100644 benchmarks/gups/gups-kokkos.cpp create mode 100644 benchmarks/gups/gups.cpp create mode 100644 benchmarks/launch_latency/CMakeLists.txt create mode 100644 benchmarks/launch_latency/launch_latency.cpp create mode 100644 benchmarks/policy_performance/CMakeLists.txt create mode 100644 benchmarks/stream/CMakeLists.txt create mode 100644 benchmarks/view_copy_constructor/CMakeLists.txt rename benchmarks/{gups => view_copy_constructor}/Makefile (73%) create mode 100644 benchmarks/view_copy_constructor/view_copy_constructor.cpp delete mode 100644 cmake/Modules/FindTPLLIBRT.cmake delete mode 100644 cmake/Modules/FindTPLMEMKIND.cmake create mode 100644 cmake/Modules/FindTPLROCTHRUST.cmake delete mode 100644 cmake/deps/CUSPARSE.cmake delete mode 100644 cmake/tpls/FindTPLCUSPARSE.cmake delete mode 100755 config/test_all_sandia delete mode 100644 config/yaml/volta.yaml create mode 100644 core/perf_test/PerfTest_MallocFree.cpp create mode 100644 core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp create mode 100644 core/src/HIP/Kokkos_HIP_GraphNode_Impl.hpp create mode 100644 core/src/HIP/Kokkos_HIP_Graph_Impl.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp rename core/src/HIP/{Kokkos_HIP_Parallel_MDRange.hpp => Kokkos_HIP_ParallelReduce_MDRange.hpp} (61%) create mode 100644 core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp create mode 100644 core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp rename core/src/HIP/{Kokkos_HIP_Parallel_Range.hpp => Kokkos_HIP_ParallelScan_Range.hpp} (53%) delete mode 100644 core/src/HIP/Kokkos_HIP_Parallel_Team.hpp create mode 100644 core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp create mode 100644 core/src/Kokkos_Abort.hpp create mode 100644 core/src/Kokkos_Assert.hpp create mode 100644 core/src/Kokkos_Clamp.hpp delete mode 100644 core/src/Kokkos_HBWSpace.hpp delete mode 100644 core/src/Kokkos_LogicalSpaces.hpp delete mode 100644 core/src/Kokkos_MasterLock.hpp rename core/src/{Kokkos_MinMaxClamp.hpp => Kokkos_MinMax.hpp} (83%) create mode 100644 core/src/Kokkos_Printf.hpp create mode 100644 core/src/Kokkos_Swap.hpp delete mode 100644 core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp create mode 100644 core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp create mode 100644 core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp create mode 100644 core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp create mode 100644 core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp rename core/src/OpenMPTarget/{Kokkos_OpenMPTarget_Parallel_MDRange.hpp => Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp} (62%) create mode 100644 core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp rename core/src/SYCL/{Kokkos_SYCL_Parallel_Range.hpp => Kokkos_SYCL_ParallelFor_MDRange.hpp} (58%) create mode 100644 core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp rename core/src/SYCL/{Kokkos_SYCL_Parallel_Scan.hpp => Kokkos_SYCL_ParallelScan_Range.hpp} (61%) delete mode 100644 core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp delete mode 100644 core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp create mode 100644 core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp rename core/src/Threads/{Kokkos_ThreadsExec.cpp => Kokkos_Threads_Instance.cpp} (56%) rename core/src/Threads/{Kokkos_ThreadsExec.hpp => Kokkos_Threads_Instance.hpp} (76%) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp create mode 100644 core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp rename core/src/Threads/{Kokkos_Threads_Parallel_MDRange.hpp => Kokkos_Threads_ParallelReduce_MDRange.hpp} (52%) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp rename core/src/Threads/{Kokkos_Threads_Parallel_Team.hpp => Kokkos_Threads_ParallelReduce_Team.hpp} (53%) create mode 100644 core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp delete mode 100644 core/src/Threads/Kokkos_Threads_Parallel_Range.hpp rename core/src/{impl/Kokkos_Spinwait.cpp => Threads/Kokkos_Threads_Spinwait.cpp} (90%) create mode 100644 core/src/Threads/Kokkos_Threads_Spinwait.hpp rename core/{unit_test/cuda/TestCuda_Graph.cpp => src/Threads/Kokkos_Threads_State.hpp} (54%) rename core/src/Threads/{Kokkos_ThreadsTeam.hpp => Kokkos_Threads_Team.hpp} (91%) create mode 100644 core/src/View/Kokkos_ViewAlloc.hpp create mode 100644 core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp create mode 100644 core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp rename algorithms/src/std_algorithms/Kokkos_Swap.hpp => core/src/impl/Kokkos_Abort.cpp (54%) create mode 100644 core/src/impl/Kokkos_CheckedIntegerOps.hpp rename core/src/{Kokkos_Atomics_Desul_Config.hpp => impl/Kokkos_DesulAtomicsConfig.hpp} (72%) delete mode 100644 core/src/impl/Kokkos_HBWSpace.cpp create mode 100644 core/src/impl/Kokkos_Half_MathematicalFunctions.hpp delete mode 100644 core/src/impl/Kokkos_MemorySpace.cpp delete mode 100644 core/src/impl/Kokkos_MemorySpace.hpp delete mode 100644 core/src/impl/Kokkos_Memory_Fence.hpp delete mode 100644 core/src/impl/Kokkos_Spinwait.hpp delete mode 100644 core/src/impl/Kokkos_ViewArray.hpp create mode 100644 core/src/impl/Kokkos_ViewDataAnalysis.hpp delete mode 100644 core/src/impl/Kokkos_ViewLayoutTiled.hpp delete mode 100644 core/unit_test/TestAggregate.hpp create mode 100644 core/unit_test/TestArrayOps.hpp create mode 100644 core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp create mode 100644 core/unit_test/TestCheckedIntegerOps.hpp delete mode 100644 core/unit_test/TestDefaultDeviceTypeInit.hpp create mode 100644 core/unit_test/TestExecSpaceThreadSafety.hpp create mode 100644 core/unit_test/TestMDRangePolicyCTAD.cpp create mode 100644 core/unit_test/TestMDSpanAtomicAccessor.hpp create mode 100644 core/unit_test/TestMDSpanConversion.hpp create mode 100644 core/unit_test/TestMultiGPU.hpp create mode 100644 core/unit_test/TestNestedReducerCTAD.cpp create mode 100644 core/unit_test/TestOccupancyControlTrait.hpp create mode 100644 core/unit_test/TestPrintf.hpp create mode 100644 core/unit_test/TestRangePolicyCTAD.cpp create mode 100644 core/unit_test/TestSpaceAwareAccessor.hpp create mode 100644 core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp create mode 100644 core/unit_test/TestSwap.hpp create mode 100644 core/unit_test/TestTeamMDRangePolicyCTAD.cpp create mode 100644 core/unit_test/TestTeamPolicyCTAD.cpp create mode 100644 core/unit_test/TestViewBadAlloc.hpp create mode 100644 core/unit_test/TestViewCopy_c.hpp create mode 100644 core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp delete mode 100644 core/unit_test/TestViewLayoutTiled.hpp create mode 100644 core/unit_test/TestViewOfViews.hpp create mode 100644 core/unit_test/TestViewOutOfBoundsAccess.hpp create mode 100644 core/unit_test/UnitTest_ScopeGuard.cpp create mode 100644 core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp delete mode 100644 core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp delete mode 100644 core/unit_test/hip/TestHIP_Graph.cpp delete mode 100644 core/unit_test/openmp/TestOpenMP_Graph.cpp delete mode 100644 core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp delete mode 100644 core/unit_test/serial/TestSerial_Graph.cpp create mode 100644 core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp delete mode 100644 core/unit_test/tools/TestLogicalSpaces.hpp create mode 100644 simd/src/Kokkos_SIMD_Common_Math.hpp create mode 100644 simd/unit_tests/include/SIMDTesting_Ops.hpp create mode 100644 simd/unit_tests/include/SIMDTesting_Utilities.hpp create mode 100644 simd/unit_tests/include/TestSIMD_Condition.hpp create mode 100644 simd/unit_tests/include/TestSIMD_Conversions.hpp create mode 100644 simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp create mode 100644 simd/unit_tests/include/TestSIMD_MaskOps.hpp create mode 100644 simd/unit_tests/include/TestSIMD_MathOps.hpp create mode 100644 simd/unit_tests/include/TestSIMD_Reductions.hpp create mode 100644 simd/unit_tests/include/TestSIMD_ShiftOps.hpp create mode 100644 simd/unit_tests/include/TestSIMD_WhereExpressions.hpp create mode 100644 tpls/desul/include/desul/atomics/Adapt_HIP.hpp create mode 100644 tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal delete mode 100644 tpls/desul/include/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic delete mode 100644 tpls/mdspan/include/experimental/__p0009_bits/maybe_static_value.hpp delete mode 100644 tpls/mdspan/include/experimental/__p0009_bits/standard_layout_static_array.hpp delete mode 100644 tpls/mdspan/include/experimental/__p0009_bits/static_array.hpp delete mode 100644 tpls/mdspan/include/experimental/__p0009_bits/submdspan.hpp create mode 100644 tpls/mdspan/include/experimental/__p0009_bits/utility.hpp rename core/src/decl/Kokkos_Declare_HBWSpace.hpp => tpls/mdspan/include/experimental/__p2389_bits/dims.hpp (59%) create mode 100644 tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp create mode 100644 tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp create mode 100644 tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp create mode 100644 tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp create mode 100644 tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp create mode 100644 tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp delete mode 100644 tpls/mdspan/include/experimental/mdarray delete mode 100644 tpls/mdspan/include/experimental/mdspan rename core/src/fwd/Kokkos_Fwd_HBWSpace.hpp => tpls/mdspan/include/mdspan/mdarray.hpp (64%) create mode 100644 tpls/mdspan/include/mdspan/mdspan.hpp diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 5a259e3a58..f7a0a7185a 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,21 +1,22 @@ --- name: Bug report -about: Create a report to correct failures and improve our code +about: Create a report (for github issue tracker) to correct failures title: '' labels: '' assignees: '' --- **Describe the bug** -Please provide a concise, clear description of the bug, as well as any available error logs. -**Please also include the following items to support reproducing the bug** -1. compilers (with versions) +Please provide a concise, clear description of the bug, as well as any available error logs. Feel free to contact the Kokkos Slack `# build` channel for further discussion of your issue. + +**Please include the following for a minimal reproducer** + +1. Compilers (with versions) 2. Kokkos release or commit used (i.e., the sha1 number) -3. platform and backend -4. cmake configure command -5. output from cmake command -6. code needed to reproduce the bug -7. command line needed to reproduce the bug -7. please also attach the `KokkosCore_config.h` header file (generated during the build); -**Any additional info** -Please provide any additional context about the issue here. +3. Platform, architecture and backend +4. CMake configure command +5. Output from CMake configure command +6. Minimum, complete code needed to reproduce the bug +7. Command line needed to reproduce the bug +8. `KokkosCore_config.h` header file (generated during the build) +9. Please provide any additional relevant error logs diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..7242bd2851 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: / + schedule: + interval: "weekly" diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 0000000000..f4dbe56011 --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,15 @@ +name: clang-format check + +on: [push, pull_request] + +permissions: read-all + +jobs: + formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@v0.17 + with: + clangFormatVersion: 8 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..2ed86a1475 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,51 @@ +name: "CodeQL" + +on: + push: + branches: [ "master", "develop", "release-*" ] + pull_request: + branches: [ "develop" ] + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: c-cpp + + - name: configure + run: + cmake -B build . + -DKokkos_ENABLE_OPENMP=ON + -DCMAKE_CXX_STANDARD=17 + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF + -DKokkos_ENABLE_TESTS=ON + -DKokkos_ENABLE_EXAMPLES=ON + -DKokkos_ENABLE_BENCHMARKS=ON + -DCMAKE_BUILD_TYPE=Debug + - name: build + run: + cmake --build build --parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:c-cpp" diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 7fab3b0e62..0260cb5894 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -1,5 +1,15 @@ name: github-Linux-32bit -on: [push, pull_request] + +on: + push: + branches: + - develop + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] + +permissions: read-all concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} @@ -13,7 +23,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/ubuntu:latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install_multilib run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib - name: Configure Kokkos @@ -26,7 +36,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 0c7abd2fc1..35704a28cf 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -1,11 +1,20 @@ name: github-Linux-hpx -on: [push, pull_request] +on: + push: + branches: + - develop + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: hpx: name: hpx @@ -13,7 +22,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos - name: setup hpx dependencies @@ -26,12 +35,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: STELLAR-GROUP/hpx - ref: 1.8.0 + ref: v1.9.0 path: hpx - - uses: actions/cache@v3 + - uses: actions/cache@v4 id: cache-hpx with: path: ./hpx/install @@ -73,7 +82,7 @@ jobs: -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_HPX=ON \ - -DKokkos_ENABLE_SERIAL=OFF \ + -DKokkos_ENABLE_SERIAL=ON \ -DKokkos_ENABLE_TESTS=ON \ .. diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 741f6b2746..9c7b7585d0 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -1,60 +1,87 @@ name: github-Linux -on: [push, pull_request] + +on: + push: + branches: + - develop + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: CI: continue-on-error: true strategy: matrix: - distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + distro: ['fedora:latest', 'ubuntu:latest'] cxx: ['g++', 'clang++'] cxx_extra_flags: [''] cmake_build_type: ['Release', 'Debug'] backend: ['OPENMP'] clang-tidy: [''] + stdcxx: [17] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 'icpx' - cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' + cxx_extra_flags: '-fp-model=precise -Wno-pass-failed -fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' backend: 'OPENMP' + stdcxx: '20' - distro: 'ubuntu:latest' cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"' + stdcxx: '23' + - distro: 'ubuntu:latest' + cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize-recover=all' + cmake_build_type: 'RelWithDebInfo' + backend: 'SERIAL' + stdcxx: '20' - distro: 'ubuntu:latest' cxx: 'g++' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' + stdcxx: '23' runs-on: ubuntu-latest container: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} steps: - name: Checkout desul - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: desul/desul - ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b + ref: 779d0441a778c7088a36d38c4cbf8df3cfa182cc path: desul - name: Install desul working-directory: desul @@ -66,15 +93,12 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . --target install --parallel 2 - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }} - - name: maybe_disable_death_tests - if: ${{ matrix.distro == 'fedora:rawhide' }} - run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV - name: maybe_use_flang_new if: ${{ matrix.cxx == 'clang++' && startsWith(matrix.distro,'fedora:') }} run: echo "FC=flang-new" >> $GITHUB_ENV @@ -100,6 +124,8 @@ jobs: -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ + -DCMAKE_CXX_STANDARD="${{ matrix.stdcxx }}" \ + -DCMAKE_EXE_LINKER_FLAGS="${{ matrix.extra_linker_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} @@ -112,6 +138,7 @@ jobs: working-directory: builddir run: ctest --output-on-failure - name: Test linking against build dir + if: ${{ !contains(matrix.cxx_extra_flags, '-fsanitize=address') }} working-directory: example/build_cmake_installed run: | cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir @@ -122,6 +149,7 @@ jobs: - name: Install run: sudo cmake --build builddir --target install - name: Test install + if: ${{ !contains(matrix.cxx_extra_flags, '-fsanitize=address') }} working-directory: example/build_cmake_installed run: | cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 0ff3266848..2bcf41a0d3 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -1,11 +1,20 @@ name: github-OSX -on: [push, pull_request] +on: + push: + branches: + - develop + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: osxci: name: osx-ci @@ -24,7 +33,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: configure run: cmake -B build . diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 205239e043..bfbbeea4a8 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -4,6 +4,11 @@ on: branches: - develop pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] + +permissions: read-all jobs: CI: @@ -20,8 +25,8 @@ jobs: BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml new file mode 100644 index 0000000000..f0bb9b0b19 --- /dev/null +++ b/.github/workflows/releases.yml @@ -0,0 +1,83 @@ +on: + push: + tags: '[0-9]+.[0-9]+.[0-9][0-9]' + + +permissions: read-all + +jobs: + # This step builds our artifacts, uploads them to the workflow run, and + # outputs their digest. + build: + outputs: + hashes: ${{ steps.hash.outputs.hashes }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build artifacts + run: | + git archive -o kokkos-${{ github.ref_name }}.zip HEAD + git archive -o kokkos-${{ github.ref_name }}.tar.gz HEAD + + - name: Generate hashes + shell: bash + id: hash + run: | + # sha256sum generates sha256 hash for all artifacts. + # base64 -w0 encodes to base64 and outputs on a single line. + echo "hashes=$(sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz | base64 -w0)" >> "$GITHUB_OUTPUT" + + - name: Upload source code (zip) + uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 + with: + name: kokkos-${{ github.ref_name }}.zip + path: kokkos-${{ github.ref_name }}.zip + if-no-files-found: error + retention-days: 5 + + - name: Upload source code (tar.gz) + uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5 + with: + name: kokkos-${{ github.ref_name }}.tar.gz + path: kokkos-${{ github.ref_name }}.tar.gz + if-no-files-found: error + retention-days: 5 + + # This step calls the generic workflow to generate provenance. + provenance: + needs: [build] + permissions: + actions: read + id-token: write + contents: write + uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0 + with: + base64-subjects: "${{ needs.build.outputs.hashes }}" + # Upload provenance to a new release + upload-assets: true + provenance-name: "kokkos-${{ github.ref_name }}.intoto.jsonl" + + # This step uploads our artifacts to the tagged GitHub release. + release: + needs: [build, provenance] + permissions: + contents: write + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + steps: + - name: Download kokkos-${{ github.ref_name }}.zip + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: kokkos-${{ github.ref_name }}.zip + + - name: Download kokkos-${{ github.ref_name }}.tar.gz + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: kokkos-${{ github.ref_name }}.tar.gz + + - name: Upload assets + uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8 + with: + files: | + kokkos-${{ github.ref_name }}.zip + kokkos-${{ github.ref_name }}.tar.gz diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 0000000000..77580978b2 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + # Weekly on Saturdays. + - cron: '30 1 * * 6' + push: + branches: [ master, develop ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload SARIF results to code scanning" + uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 + with: + sarif_file: results.sarif diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 0000000000..d801c740a7 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,35 @@ +name: github-windows + +on: + push: + pull_request: + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + windows-cuda: + # Cuda build on Windows + name: Windows Cuda + runs-on: windows-2022 + + steps: + - uses: Jimver/cuda-toolkit@v0.2.16 + id: cuda-toolkit + with: + cuda: '12.4.1' + - uses: actions/checkout@v4 + - name: configure + shell: bash + run: | + mkdir build + mkdir c:/project + cd build + cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON .. + - name: build library + shell: bash + run: | + cmake --build build --parallel 2 --config Release diff --git a/.jenkins b/.jenkins index c7d8ce533d..0393ff06fb 100644 --- a/.jenkins +++ b/.jenkins @@ -3,21 +3,26 @@ pipeline { environment { CCACHE_DIR = '/tmp/ccache' - CCACHE_MAXSIZE = '10G' + CCACHE_MAXSIZE = '5G' CCACHE_CPP2 = 'true' } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } @@ -27,7 +32,7 @@ pipeline { } stage('Build') { parallel { - stage('OPENACC-NVHPC-CUDA-11.6') { + stage('OPENACC-NVHPC-CUDA-12.2') { agent { dockerfile { filename 'Dockerfile.nvhpc' @@ -37,7 +42,7 @@ pipeline { } } environment { - CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/cuda/11.6' + NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' } steps { sh '''rm -rf build && mkdir -p build && cd build && \ @@ -53,13 +58,12 @@ pipeline { make -j8 && ctest --verbose''' } } - stage('CUDA-11.7-NVHPC') { + stage('CUDA-12.2-NVHPC-AS-HOST-COMPILER') { agent { dockerfile { filename 'Dockerfile.nvhpc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/nvhpc:22.9-devel-cuda11.7-ubuntu20.04' - label 'nvidia-docker && large_images' + label 'nvidia-docker && large_images && volta' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } @@ -70,7 +74,7 @@ pipeline { OMP_MAX_ACTIVE_LEVELS = 1 OMP_PLACES = 'threads' OMP_PROC_BIND = 'spread' - NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/22.9/cuda/11.7' + NVHPC_CUDA_HOME = '/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2' } steps { sh '''rm -rf build && mkdir -p build && cd build && \ @@ -78,7 +82,7 @@ pipeline { -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="--diag_suppress=implicit_return_from_non_void_function,no_device_stack" \ + -DCMAKE_CXX_FLAGS="--diag_suppress=implicit_return_from_non_void_function" \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ @@ -95,22 +99,22 @@ pipeline { dockerfile { filename 'Dockerfile.sycl' dir 'scripts/docker' - label 'nvidia-docker && volta' + label 'nvidia-docker && ampere' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } steps { sh 'ccache --zero-stats' - sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ + -DCMAKE_PREFIX_PATH="$ONE_DPL_DIR" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ARCH_AMPERE80=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ @@ -134,8 +138,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -161,6 +165,7 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ .. && \ make -j8 && ctest --verbose''' @@ -171,13 +176,13 @@ pipeline { } } } - stage('HIP-ROCm-5.2-C++20') { + stage('HIP-ROCm-5.6-C++20') { agent { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete' + label 'rocm-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -185,6 +190,7 @@ pipeline { sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ + -DBUILD_SHARED_LIBS=ON \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=hipcc \ -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ @@ -239,7 +245,7 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_OPENMPTARGET=ON \ -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ARCH_VEGA906=ON \ + -DKokkos_ARCH_AMD_GFX906=ON \ && \ cmake --build build --parallel ${BUILD_JOBS} && \ cd build && ctest --output-on-failure @@ -287,7 +293,7 @@ pipeline { } } } - stage('CUDA-10.1-Clang-Tidy') { + stage('CUDA-11.0.3-Clang-Tidy') { agent { dockerfile { filename 'Dockerfile.kokkosllvmproject' @@ -304,7 +310,7 @@ pipeline { -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ @@ -330,7 +336,7 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.0-devel-ubuntu20.04' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.1-devel-ubuntu20.04' label 'nvidia-docker && volta' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } @@ -340,7 +346,7 @@ pipeline { sh '''rm -rf build && mkdir -p build && cd build && \ ../gnu_generate_makefile.bash \ --with-options=compiler_warnings \ - --cxxflags="-Werror" \ + --cxxflags="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ --cxxstandard=c++17 \ --with-cuda \ --with-cuda-options=enable_lambda \ @@ -360,7 +366,7 @@ pipeline { filename 'Dockerfile.nvcc' dir 'scripts/docker' additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' - label 'nvidia-docker' + label 'nvidia-docker && (volta || ampere)' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } @@ -389,7 +395,6 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ @@ -431,8 +436,8 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.0-devel-ubuntu20.04' - label 'nvidia-docker' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.2-devel-ubuntu20.04' + label 'nvidia-docker && (volta || ampere)' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } @@ -440,10 +445,11 @@ pipeline { sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ + -DBUILD_SHARED_LIBS=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_FLAGS="-Werror -Werror all-warnings -Xcudafe --diag_suppress=20208" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ @@ -455,6 +461,8 @@ pipeline { -DKokkos_ENABLE_CUDA=ON \ -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ + -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ + -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \ .. && \ make -j8 && ctest --verbose && \ cd ../example/build_cmake_in_tree && \ @@ -489,9 +497,8 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_OPENMP=ON \ diff --git a/.jenkins_nightly b/.jenkins_nightly index 8bcdb75a2a..8dd02e9f02 100644 --- a/.jenkins_nightly +++ b/.jenkins_nightly @@ -70,6 +70,74 @@ pipeline { ''' } } + stage('GCC-14') { + agent { + docker { + image 'gcc:14.1' + label 'docker' + } + } + steps { + sh ''' + wget https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh && \ + chmod +x cmake-3.30.0-linux-x86_64.sh && ./cmake-3.30.0-linux-x86_64.sh --skip-license --prefix=/usr + + mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=26 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ + -DKokkos_ENABLE_SERIAL=ON \ + .. && \ + make -j8 && ctest --verbose + ''' + } + } + stage('HIP-ROCM-6.1') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.1.2-complete' + label 'rocm-docker && AMD_Radeon_Instinct_MI210' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + environment { + // FIXME Test returns a wrong value + GTEST_FILTER = '-hip_hostpinned.view_allocation_large_rank' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. && \ + make -j8 && ctest --verbose''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } } } } diff --git a/.olcf-gitlab-ci.yml b/.olcf-gitlab-ci.yml new file mode 100644 index 0000000000..4e737cc536 --- /dev/null +++ b/.olcf-gitlab-ci.yml @@ -0,0 +1,12 @@ +test: + stage: test + tags: [frontier, shell] + id_tokens: + OLCF_ID_TOKEN: + aud: https://code.olcf.ornl.gov + script: + - module load rocm/6.0 + - cmake -B build -DCMAKE_CXX_COMPILER=hipcc -DKokkos_ENABLE_HIP=ON -DKokkos_ENABLE_TESTS=ON + - cmake --build build -j48 + - cd build + - ctest -E Kokkos_CoreUnitTest_DeviceAndThreads -V diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c145c44b3..78225f9e6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,307 @@ # CHANGELOG -## [4.1.00](https://github.com/kokkos/kokkos/tree/4.0.01) (2023-06-16) +## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) + +### Features: +* Add `Kokkos::View` conversions from and to [`std::mdspan`](https://en.cppreference.com/w/cpp/container/mdspan) [\#6830](https://github.com/kokkos/kokkos/pull/6830) [\#7069](https://github.com/kokkos/kokkos/pull/7069) + +### Backend and Architecture Enhancements: + +#### CUDA: +* `nvcc_wrapper`: Adding ability to process `--disable-warnings` flag [\#6936](https://github.com/kokkos/kokkos/issues/6936) +* Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors [\#6891](https://github.com/kokkos/kokkos/issues/6891) +* Improve compile-times when building with `Kokkos_ENABLE_DEBUG_BOUNDS_CHECK` in Cuda [\#7013](https://github.com/kokkos/kokkos/pull/7013) + +#### HIP: +* Use HIP builtin atomics [\#6882](https://github.com/kokkos/kokkos/pull/6882) [\#7000](https://github.com/kokkos/kokkos/pull/7000) +* Enable user-specified compiler and linker flags for AMD GPUs [\#7127](https://github.com/kokkos/kokkos/pull/7127) + +#### SYCL: +* Add support for Graphs [\#6912](https://github.com/kokkos/kokkos/pull/6912) +* Fix multi-GPU support [\#6887](https://github.com/kokkos/kokkos/pull/6887) +* Improve performance of reduction and scan operations [\#6562](https://github.com/kokkos/kokkos/pull/6562), [\#6750](https://github.com/kokkos/kokkos/pull/6750) +* Fix lock for guarding scratch space in `TeamPolicy` `parallel_reduce` [\#6988](https://github.com/kokkos/kokkos/pull/6988) +* Include submission command queue property information into `SYCL::print_configuration()` [\#7004](https://github.com/kokkos/kokkos/pull/7004) + +#### OpenACC: +* Make `TeamPolicy` `parallel_for` execute on the correct async queue [\#7012](https://github.com/kokkos/kokkos/pull/7012) + +#### OpenMPTarget: +* Honor user requested loop ordering in `MDRange` policy [\#6925](https://github.com/kokkos/kokkos/pull/6925) +* Prevent data races by guarding the scratch space used in `parallel_scan` [\#6998](https://github.com/kokkos/kokkos/pull/6998) + +#### HPX: +* Workaround issue with template argument deduction to support compilation with NVCC [\#7015](https://github.com/kokkos/kokkos/pull/7015) + +### General Enhancements +* Improve performance of view copies in host parallel regions [\#6730](https://github.com/kokkos/kokkos/pull/6730) +* Harmonize convertibility rules of `Kokkos::RandomAccessIterator` with `View`s [\#6929](https://github.com/kokkos/kokkos/pull/6929) +* Add a check precondition non-overlapping ranges for the `adjacent_difference` algorithm in debug mode [\#6922](https://github.com/kokkos/kokkos/pull/6922) +* Add deduction guides for `TeamPolicy` [\#7030](https://github.com/kokkos/kokkos/pull/7030) +* SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) +* Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) +* Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) + +### Build System Changes +* Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) +* Update Intel GPU architectures in Makefile [\#6895](https://github.com/kokkos/kokkos/pull/6895) +* Fix use of OpenMP with Cuda or HIP as compile language [\#6972](https://github.com/kokkos/kokkos/pull/6972) +* Define and enforce new minimum compiler versions for C++20 support [\#7128](https://github.com/kokkos/kokkos/pull/7128), [\#7123](https://github.com/kokkos/kokkos/pull/7123) +* Add nvidia Grace CPU architecture: `Kokkos_ARCH_ARMV9_GRACE` [\#7158](https://github.com/kokkos/kokkos/pull/7158) +* Fix Makefile.kokkos for Threads [\#6896](https://github.com/kokkos/kokkos/pull/6896) +* Remove support for NVHPC as CUDA device compiler [\#6987](https://github.com/kokkos/kokkos/pull/6987) +* Fix using CUDAToolkit for CMake 3.28.4 and higher [\#7062](https://github.com/kokkos/kokkos/pull/7062) + +### Incompatibilities (i.e. breaking changes) +* Drop `Kokkos::Array` special treatment in `View`s [\#6906](https://github.com/kokkos/kokkos/pull/6906) +* Drop `Experimental::RawMemoryAllocationFailure` [\#7145](https://github.com/kokkos/kokkos/pull/7145) + +### Deprecations +* Remove `Experimental::LayoutTiled` class template and deprecate `is_layouttiled` trait [\#6907](https://github.com/kokkos/kokkos/pull/6907) +* Deprecate `Kokkos::layout_iterate_type_selector` [\#7076](https://github.com/kokkos/kokkos/pull/7076) +* Deprecate specialization of `Kokkos::pair` for a single element [\#6947](https://github.com/kokkos/kokkos/pull/6947) +* Deprecate `deep_copy` of `UnorderedMap` of different size [\#6812](https://github.com/kokkos/kokkos/pull/6812) +* Deprecate trailing `Proxy` template argument of `Kokkos::Array` [\#6934](https://github.com/kokkos/kokkos/pull/6934) +* Deprecate implicit conversions of integers to `ChunkSize` [\#7151](https://github.com/kokkos/kokkos/pull/7151) +* Deprecate implicit conversions to execution spaces [\#7156](https://github.com/kokkos/kokkos/pull/7156) + +### Bug Fixes +* Do not return a copy of the input functor in `Experimental::for_each` [\#6910](https://github.com/kokkos/kokkos/pull/6910) +* Fix `realloc` on views of non-default constructible element types [\#6993](https://github.com/kokkos/kokkos/pull/6993) +* Fix undefined behavior in `View` initialization or fill with zeros [\#7014](https://github.com/kokkos/kokkos/pull/7014) +* Fix `sort_by_key` on host execution spaces when building with NVCC [\#7059](https://github.com/kokkos/kokkos/pull/7059) +* Fix using shared libraries and -fvisibility=hidden [\#7065](https://github.com/kokkos/kokkos/pull/7065) +* Fix view reference counting when functor copy constructor throws in parallel dispatch [\#6289](https://github.com/kokkos/kokkos/pull/6289) +* Fix `initialize(InitializationSetting)` for handling `print_configuration` setting [\#7098](https://github.com/kokkos/kokkos/pull/7098) +* Thread safety fixes for the Serial and OpenMP backend [\#7080](https://github.com/kokkos/kokkos/pull/7080), [\#6151](https://github.com/kokkos/kokkos/pull/6151) + +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. [\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_onapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6784) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6739) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585 ](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs update [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `` and `` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessaryinitialization + avoid evaluating twice the predicate during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove all `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190 ](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + +## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746) + +#### HIP: +- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624) +- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671) + +### Build System +- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742) + +### Bug Fixes +- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653) +- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635) +- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659) +- Fix "extra `;`" warning with `-pedantic` flag in `` [\#6510](https://github.com/kokkos/kokkos/pull/6510) + +## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) + +### Features: +- SIMD: significant improvements to SIMD support and alignment with C++26 SIMD + - add `Kokkos::abs` overload for SIMD types [\#6069](https://github.com/kokkos/kokkos/pull/6069) + - add generator constructors [\#6347](https://github.com/kokkos/kokkos/pull/6347) + - convert binary operators to hidden friends [\#6320](https://github.com/kokkos/kokkos/pull/6320) + - add shift operators [\#6109](https://github.com/kokkos/kokkos/pull/6109) + - add `float` support [\#6177](https://github.com/kokkos/kokkos/pull/6177) + - add remaining `gather_from` and `scatter_to` overloads [\#6220](https://github.com/kokkos/kokkos/pull/6220) + - define simd math function overloads in the Kokkos namespace [\#6465](https://github.com/kokkos/kokkos/pull/6465), [\#6487](https://github.com/kokkos/kokkos/pull/6487) + - `Kokkos_ENABLE_NATIVE=ON` autodetects SIMD types supported [\#6188](https://github.com/kokkos/kokkos/pull/6188) + - fix AVX2 SIMD support for ZEN2 AMD CPU [\#6238](https://github.com/kokkos/kokkos/pull/6238) +- `Kokkos::printf` [\#6083](https://github.com/kokkos/kokkos/pull/6083) +- `Kokkos::sort`: support custom comparator [\#6253](https://github.com/kokkos/kokkos/pull/6253) +- `half_t` and `bhalf_t` numeric traits [\#5778](https://github.com/kokkos/kokkos/pull/5778) +- `half_t` and `bhalf_t` mixed comparisons [\#6407](https://github.com/kokkos/kokkos/pull/6407) +- `half_t` and `bhalf_t` mathematical functions [\#6124](https://github.com/kokkos/kokkos/pull/6124) +- `TeamThreadRange` `parallel_scan` with return value [\#6090](https://github.com/kokkos/kokkos/pull/6090), [\#6301](https://github.com/kokkos/kokkos/pull/6301), [\#6302](https://github.com/kokkos/kokkos/pull/6302), [\#6303](https://github.com/kokkos/kokkos/pull/6303), [\#6307](https://github.com/kokkos/kokkos/pull/6307) +- `ThreadVectorRange` `parallel_scan` with return value [\#6235](https://github.com/kokkos/kokkos/pull/6235), [\#6242](https://github.com/kokkos/kokkos/pull/6242), [\#6308](https://github.com/kokkos/kokkos/pull/6308), [\#6305](https://github.com/kokkos/kokkos/pull/6305), [\#6292](https://github.com/kokkos/kokkos/pull/6292) +- Add team-level std algorithms [\#6200](https://github.com/kokkos/kokkos/pull/6200), [\#6205](https://github.com/kokkos/kokkos/pull/6205), [\#6207](https://github.com/kokkos/kokkos/pull/6207), [\#6208](https://github.com/kokkos/kokkos/pull/6208), [\#6209](https://github.com/kokkos/kokkos/pull/6209), [\#6210](https://github.com/kokkos/kokkos/pull/6210), [\#6211](https://github.com/kokkos/kokkos/pull/6211), [\#6212](https://github.com/kokkos/kokkos/pull/6212), [\#6213](https://github.com/kokkos/kokkos/pull/6213), [\#6256](https://github.com/kokkos/kokkos/pull/6256), [\#6258](https://github.com/kokkos/kokkos/pull/6258), [\#6350](https://github.com/kokkos/kokkos/pull/6350), [\#6351](https://github.com/kokkos/kokkos/pull/6351) +- Serial: Allow for distinct execution space instances [\#6441](https://github.com/kokkos/kokkos/pull/6441) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Fixed potential data race in Cuda `parallel_reduce` [\#6236](https://github.com/kokkos/kokkos/pull/6236) +- Use `cudaMallocAsync` by default [\#6402](https://github.com/kokkos/kokkos/pull/6402) +- Bugfix for using Kokkos from a thread of execution [\#6299](https://github.com/kokkos/kokkos/pull/6299) + +#### HIP: +- New naming convention for AMD GPU: VEGA906, VEGA908, VEGA90A, NAVI1030 to AMD_GFX906, AMD_GFX908, AMD_GFX90A, AMD_GFX1030 [\#6266](https://github.com/kokkos/kokkos/pull/6266) +- Add initial support for gfx942: [\#6358](https://github.com/kokkos/kokkos/pull/6358) +- Improve reduction performance [\#6229](https://github.com/kokkos/kokkos/pull/6229) +- Deprecate `HIP(hipStream_t,bool)` constructor [\#6401](https://github.com/kokkos/kokkos/pull/6401) +- Add support for Graph [\#6370](https://github.com/kokkos/kokkos/pull/6370) +- Improve reduction performance when using Teams [\#6284](https://github.com/kokkos/kokkos/pull/6284) +- Fix concurrency calculation [\#6479](https://github.com/kokkos/kokkos/pull/6479) +- Fix potential data race in HIP `parallel_reduce` [\#6429](https://github.com/kokkos/kokkos/pull/6429) + +#### SYCL: +- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246) +- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) +- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321) +- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322) +- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189) + +#### OpenACC: +- Support Clacc compiler [\#6250](https://github.com/kokkos/kokkos/pull/6250) + +### General Enhancements +- Add missing `is_*_view` traits and `is_*_view_v` helper variable templates for `DynRankView`, `DynamicView`, `OffsetView`, `ScatterView` containers [\#6195](https://github.com/kokkos/kokkos/pull/6195) +- Make `nvcc_wrapper` and `compiler_launcher` scripts more portable by switching to a `#!/usr/bin/env` shebang [\#6357](https://github.com/kokkos/kokkos/pull/6357) +- Add an improved `Kokkos::malloc` / `Kokkos::free` performance test [\#6377](https://github.com/kokkos/kokkos/pull/6377) +- Ensure `Views` with `size==0` can be used with `deep_copy` [\#6273](https://github.com/kokkos/kokkos/pull/6273) +- `Kokkos::abort` is moved to header `Kokkos_Abort.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- `KOKKOS_ASSERT`, `KOKKOS_EXPECTS`, `KOKKOS_ENSURES` are moved to header `Kokkos_Assert.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- Add a permuted-index mode to the gups benchmark [\#6378](https://github.com/kokkos/kokkos/pull/6378) +- Check for overflow during backend initialization [\#6159](https://github.com/kokkos/kokkos/pull/6159) +- Make constraints on `Kokkos::sort` more visible [\#6234](https://github.com/kokkos/kokkos/pull/6234) and cleanup API [\#6239](https://github.com/kokkos/kokkos/pull/6239) +- Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474) + + +### Build System Changes + +- Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282) +- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342) + +### Incompatibilities (i.e. breaking changes) + - Ensure that `Kokkos::complex` only gets instantiated for cv-unqualified floating-point types [\#6251](https://github.com/kokkos/kokkos/pull/6251) + - Removed (deprecated-3) support for volatile join operators in reductions [\#6385](https://github.com/kokkos/kokkos/pull/6385) + - Enforce `ViewCtorArgs` restrictions for `create_mirror_view` [\#6304](https://github.com/kokkos/kokkos/pull/6304) + - SIMD types for ARM NEON are not autodetected anymore but need `Kokkos_ARCH_ARM_NEON` or `Kokkos_ARCH_NATIVE=ON` [\#6394](https://github.com/kokkos/kokkos/pull/6394) + - Remove `#include ` from headers where possible [\#6482](https://github.com/kokkos/kokkos/pull/6482) + +### Deprecations +- Deprecated `Kokkos::vector` [\#6252](https://github.com/kokkos/kokkos/pull/6252) +- All host allocation mechanisms except for `STD_MALLOC` have been deprecated [\#6341](https://github.com/kokkos/kokkos/pull/6341) + +### Bug Fixes + - Missing memory fence in `RandomPool::free_state` functions [\#6290](https://github.com/kokkos/kokkos/pull/6290) + - Fix for corner case in `Kokkos::Experimental::is_partitioned` algorithm [\#6257](https://github.com/kokkos/kokkos/pull/6257) + - Fix initialization of scratch lock variables in the `Cuda` backend [\#6433](https://github.com/kokkos/kokkos/pull/6433) + - Fixes for `Kokkos::Array` [\#6372](https://github.com/kokkos/kokkos/pull/6372) + - Fixed symlink configure issue for Windows [\#6241](https://github.com/kokkos/kokkos/pull/6241) + - OpenMPTarget init-join fix [\#6444](https://github.com/kokkos/kokkos/pull/6444) + - Fix atomic operations bug for Min and Max [\#6435](https://github.com/kokkos/kokkos/pull/6435) + - Fix implementation for `cyl_bessel_i0` [\#6484](https://github.com/kokkos/kokkos/pull/6484) + - Fix various NVCC warnings in `BinSort`, `Array`, and bit manipulation function templates [\#6483](https://github.com/kokkos/kokkos/pull/6483) + +## [4.1.00](https://github.com/kokkos/kokkos/tree/4.1.00) (2023-06-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.01...4.1.00) ### Features: @@ -887,95 +1188,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. -- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..28c674c451 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: https://kokkos.org/community/team/ +identifiers: + - type: url + website: https://kokkos.org/kokkos-core-wiki/citation.html +repository-code: 'https://github.com/kokkos/kokkos' +url: 'https://kokkos.org/' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. + family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. + family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 diff --git a/CMakeLists.txt b/CMakeLists.txt index 895cee6a08..054de2c1da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 1) -set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -252,7 +252,6 @@ ENDIF() # subpackages ## This restores the old behavior of ProjectCompilerPostConfig.cmake -# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos # We must do this before KOKKOS_PACKAGE_DECL IF (KOKKOS_HAS_TRILINOS) # Overwrite the old flags at the top-level @@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS) SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}") IF (KOKKOS_ENABLE_CUDA) STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}") ENDIF() - # Both parent scope and this package - # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in - # TRILINOS_TOPLEVEL_CXX_FLAGS - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE) - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}") - #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here #These flags get set up in KOKKOS_PACKAGE_DECL, which means they #must be configured before KOKKOS_PACKAGE_DECL SET(KOKKOS_ALL_COMPILE_OPTIONS @@ -314,7 +305,6 @@ KOKKOS_PROCESS_SUBPACKAGES() # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_EXCLUDE_AUTOTOOLS_FILES() KOKKOS_PACKAGE_POSTPROCESS() KOKKOS_CONFIGURE_CORE() diff --git a/Copyright.txt b/Copyright.txt index 5e2f8d8647..cbba3efc7b 100644 --- a/Copyright.txt +++ b/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. diff --git a/LICENSE b/LICENSE index 6572cc2db0..4d9d69d7c4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,3 @@ - ************************************************************************ - - Kokkos v. 4.0 - Copyright (2022) National Technology & Engineering - Solutions of Sandia, LLC (NTESS). - - Under the terms of Contract DE-NA0003525 with NTESS, - the U.S. Government retains certain rights in this software. - - ============================================================================== Kokkos is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== diff --git a/Makefile.kokkos b/Makefile.kokkos index 9436b75b9e..15f24f3073 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 1 -KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -11,15 +11,15 @@ KOKKOS_DEVICES ?= "Threads" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 -# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: Vega906,Vega908,Vega90A,Navi1030 +# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 -# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC +# Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -30,7 +30,7 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. -# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async KOKKOS_CUDA_OPTIONS ?= "" # Options: rdc @@ -46,7 +46,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -63,11 +63,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -82,6 +82,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) +KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -307,7 +308,6 @@ endif # Intel based. KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -318,12 +318,43 @@ KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL) KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX) KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR) -KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +# Traditionally, we supported, e.g., IntelGen9 instead of Intel_Gen9. The latter +# matches the CMake option but we also accept the former for backward-compatibility. KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen11) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen12LP) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP)) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_DG1) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_XeHP) +endif +# Traditionally the architecture was called PVC instead of Intel_PVC. This +# version makes us accept IntelPVC and Intel_PVC as well. KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC) # NVIDIA based. @@ -384,14 +415,13 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) -KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) +KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv9-Grace) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE) | bc)) # IBM based. -KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -402,19 +432,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) -KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) -KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A) -KOKKOS_INTERNAL_USE_ARCH_NAVI1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),Navi1030) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? -KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -559,6 +607,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -598,27 +656,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -685,8 +722,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") + ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") endif endif @@ -700,6 +739,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -712,6 +752,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -724,6 +765,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") KOKKOS_CXXFLAGS += -march=armv8.2-a+sve KOKKOS_LDFLAGS += -march=armv8.2-a+sve @@ -737,9 +779,17 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV9_GRACE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") + + KOKKOS_CXXFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 + KOKKOS_LDFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -mavx2 @@ -752,7 +802,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -mavx2 @@ -765,7 +815,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -mavx2 @@ -779,6 +829,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -792,6 +843,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -802,20 +854,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -1075,29 +1113,39 @@ endif # Figure out the architecture flag for ROCm. -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA") +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1030), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NAVI1100), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_NAVI") +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 endif @@ -1192,6 +1240,8 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0) endif tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_MDSPAN") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY") + KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) @@ -1214,7 +1264,22 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1234,30 +1299,10 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1368,11 +1413,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects @@ -1426,6 +1466,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1458,7 +1504,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/Makefile.targets b/Makefile.targets index 4e08a46c69..e8e429e027 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,12 +28,12 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp +Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort.cpp ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp @@ -80,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -121,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/README.md b/README.md index 033346e956..c8c6f8f7cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) +[![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4)](https://kokkos.org) # Kokkos: Core Libraries @@ -10,48 +10,71 @@ hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. -**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.** +**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** -For the complete documentation, click below: +Kokkos is a [Linux Foundation](https://linuxfoundation.org) project. -# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki) - -# Learning about Kokkos +## Learning about Kokkos To start learning about Kokkos: -- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities. +- [Kokkos Lectures](https://kokkos.org/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important capabilities. -- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. +- [Programming guide](https://kokkos.org/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. -- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html). +- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.org/kokkos-core-wiki/API/alphabetical.html). -- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. +- [Use cases and Examples](https://kokkos.org/kokkos-core-wiki/usecases.html): a serie of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +## Obtaining Kokkos -For non-public questions send an email to: *crtrott(at)sandia.gov* +The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). + +The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). + +```bash +curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +# Or with wget +wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +``` + +To clone the latest development version of Kokkos from GitHub: + +```bash +git clone -b develop https://github.com/kokkos/kokkos.git +``` -# Contributing to Kokkos +### Building Kokkos -Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute. +To build Kokkos, you will need to have a C++ compiler that supports C++17 or later. +All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). -# Requirements, Building and Installing +Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). + +You can also install Kokkos using [Spack](https://spack.io/): `spack install kokkos`. [Available configuration options](https://packages.spack.io/package.html?name=kokkos) can be displayed using `spack info kokkos`. + +## For the complete documentation: [kokkos.org/kokkos-core-wiki/](https://kokkos.org/kokkos-core-wiki/) + +## Support + +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. + +For non-public questions send an email to: *crtrott(at)sandia.gov* -All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html). +## Contributing -Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html). +Please see [this page](https://kokkos.org/kokkos-core-wiki/contributing.html) for details on how to contribute. -# Citing Kokkos +## Citing Kokkos -Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html). +Please see the [following page](https://kokkos.org/kokkos-core-wiki/citation.html). -# License +## License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..93cf6e3663 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90 day disclosure timeline. diff --git a/Spack.md b/Spack.md index 79606c259d..06c763a64e 100644 --- a/Spack.md +++ b/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). -Some examples are included in the `config/yaml` folder for common platforms. Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. For example, with Kokkos Kernels: ````bash diff --git a/algorithms/CMakeLists.txt b/algorithms/CMakeLists.txt index ab557ab66a..368984647e 100644 --- a/algorithms/CMakeLists.txt +++ b/algorithms/CMakeLists.txt @@ -2,6 +2,6 @@ IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) +IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) ENDIF() diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt index 1695778947..b490caca62 100644 --- a/algorithms/src/CMakeLists.txt +++ b/algorithms/src/CMakeLists.txt @@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms ${CMAKE_CURRENT_SOURCE_DIR} ) - - +KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) diff --git a/algorithms/src/Kokkos_NestedSort.hpp b/algorithms/src/Kokkos_NestedSort.hpp index 4c8be792d8..18e0674efe 100644 --- a/algorithms/src/Kokkos_NestedSort.hpp +++ b/algorithms/src/Kokkos_NestedSort.hpp @@ -14,175 +14,17 @@ // //@HEADER -#ifndef KOKKOS_NESTEDSORT_HPP_ -#define KOKKOS_NESTEDSORT_HPP_ - -#include -#include -#include - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -// true for TeamVectorRange, false for ThreadVectorRange -template -struct NestedRange {}; - -// Specialization for team-level -template <> -struct NestedRange { - template - KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) { - return Kokkos::TeamVectorRange(t, len); - } - template - KOKKOS_FUNCTION static void barrier(const TeamMember& t) { - t.team_barrier(); - } -}; - -// Specialization for thread-level -template <> -struct NestedRange { - template - KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) { - return Kokkos::ThreadVectorRange(t, len); - } - // Barrier is no-op, as vector lanes of a thread are implicitly synchronized - // after parallel region - template - KOKKOS_FUNCTION static void barrier(const TeamMember&) {} -}; - -// When just doing sort (not sort_by_key), use nullptr_t for ValueViewType. -// This only takes the NestedRange instance for template arg deduction. -template -KOKKOS_INLINE_FUNCTION void sort_nested_impl( - const TeamMember& t, const KeyViewType& keyView, - [[maybe_unused]] const ValueViewType& valueView, const Comparator& comp, - const NestedRange) { - using SizeType = typename KeyViewType::size_type; - using KeyType = typename KeyViewType::non_const_value_type; - using Range = NestedRange; - SizeType n = keyView.extent(0); - SizeType npot = 1; - SizeType levels = 0; - // FIXME: ceiling power-of-two is a common thing to need - make it a utility - while (npot < n) { - levels++; - npot <<= 1; - } - for (SizeType i = 0; i < levels; i++) { - for (SizeType j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Range::create(t, npot / 2), [=](const SizeType k) { - // How big are the brown/pink boxes? - // (Terminology comes from Wikipedia diagram) - // https://commons.wikimedia.org/wiki/File:BitonicSort.svg#/media/File:BitonicSort.svg - SizeType boxSize = SizeType(2) << (i - j); - // Which box contains this thread? - SizeType boxID = k >> (i - j); // k * 2 / boxSize; - SizeType boxStart = boxID << (1 + i - j); // boxID * boxSize - SizeType boxOffset = k - (boxStart >> 1); // k - boxID * boxSize / 2; - SizeType elem1 = boxStart + boxOffset; - // In first phase (j == 0, brown box): within a box, compare with the - // opposite value in the box. - // In later phases (j > 0, pink box): within a box, compare with fixed - // distance (boxSize / 2) apart. - SizeType elem2 = (j == 0) ? (boxStart + boxSize - 1 - boxOffset) - : (elem1 + boxSize / 2); - if (elem2 < n) { - KeyType key1 = keyView(elem1); - KeyType key2 = keyView(elem2); - if (comp(key2, key1)) { - keyView(elem1) = key2; - keyView(elem2) = key1; - if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); - } - } - } - }); - Range::barrier(t); - } - } -} - -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, - const ViewType& view) { - Impl::sort_nested_impl(t, view, nullptr, - Experimental::Impl::StdAlgoLessThanBinaryPredicate< - typename ViewType::non_const_value_type>(), - Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, const ViewType& view, - const Comparator& comp) { - Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t, - const KeyViewType& keyView, - const ValueViewType& valueView) { - Impl::sort_nested_impl(t, keyView, valueView, - Experimental::Impl::StdAlgoLessThanBinaryPredicate< - typename KeyViewType::non_const_value_type>(), - Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t, - const KeyViewType& keyView, - const ValueViewType& valueView, - const Comparator& comp) { - Impl::sort_nested_impl(t, keyView, valueView, comp, - Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t, - const ViewType& view) { - Impl::sort_nested_impl(t, view, nullptr, - Experimental::Impl::StdAlgoLessThanBinaryPredicate< - typename ViewType::non_const_value_type>(), - Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t, - const ViewType& view, - const Comparator& comp) { - Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange()); -} - -template -KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t, - const KeyViewType& keyView, - const ValueViewType& valueView) { - Impl::sort_nested_impl(t, keyView, valueView, - Experimental::Impl::StdAlgoLessThanBinaryPredicate< - typename KeyViewType::non_const_value_type>(), - Impl::NestedRange()); -} +#ifndef KOKKOS_NESTED_SORT_HPP_ +#define KOKKOS_NESTED_SORT_HPP_ +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT +#endif -template -KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t, - const KeyViewType& keyView, - const ValueViewType& valueView, - const Comparator& comp) { - Impl::sort_nested_impl(t, keyView, valueView, comp, - Impl::NestedRange()); -} +#include "sorting/Kokkos_NestedSortPublicAPI.hpp" -} // namespace Experimental -} // namespace Kokkos +#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT +#undef KOKKOS_IMPL_PUBLIC_INCLUDE +#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_NESTED_SORT +#endif #endif diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp index abb028d28e..7df12b8518 100644 --- a/algorithms/src/Kokkos_Random.hpp +++ b/algorithms/src/Kokkos_Random.hpp @@ -849,18 +849,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -956,6 +955,8 @@ class Random_XorShift64_Pool { KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift64& state) const { state_(state.state_idx_, 0) = state.state_; + // Release the lock only after the state has been updated in memory + Kokkos::memory_fence(); locks_(state.state_idx_, 0) = 0; } }; @@ -1092,18 +1093,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1208,7 +1208,9 @@ class Random_XorShift1024_Pool { KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift1024& state) const { for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i]; - p_(state.state_idx_, 0) = state.p_; + p_(state.state_idx_, 0) = state.p_; + // Release the lock only after the state has been updated in memory + Kokkos::memory_fence(); locks_(state.state_idx_, 0) = 0; } }; @@ -1541,13 +1543,23 @@ template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin, typename ViewType::const_value_type end) { - fill_random(typename ViewType::execution_space{}, a, g, begin, end); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, begin, end); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { - fill_random(typename ViewType::execution_space{}, a, g, 0, range); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, 0, range); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } } // namespace Kokkos diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp index 10f9ad6462..136b4ec82d 100644 --- a/algorithms/src/Kokkos_Sort.hpp +++ b/algorithms/src/Kokkos_Sort.hpp @@ -21,762 +21,10 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT #endif -#include -#include -#include -#include - -#if defined(KOKKOS_ENABLE_CUDA) - -// Workaround for `Instruction 'shfl' without '.sync' is not supported on -// .target sm_70 and higher from PTX ISA version 6.4`. -// Also see https://github.com/NVIDIA/cub/pull/170. -#if !defined(CUB_USE_COOPERATIVE_GROUPS) -#define CUB_USE_COOPERATIVE_GROUPS -#endif - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" - -#if defined(KOKKOS_COMPILER_CLANG) -// Some versions of Clang fail to compile Thrust, failing with errors like -// this: -// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: -// error: use of undeclared identifier 'va_printf' -// The exact combination of versions for Clang and Thrust (or CUDA) for this -// failure was not investigated, however even very recent version combination -// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. -// -// Defining _CubLog here locally allows us to avoid that code path, however -// disabling some debugging diagnostics -#pragma push_macro("_CubLog") -#ifdef _CubLog -#undef _CubLog -#endif -#define _CubLog -#include -#include -#pragma pop_macro("_CubLog") -#else -#include -#include -#endif - -#pragma GCC diagnostic pop - -#endif - -#if defined(KOKKOS_ENABLE_ONEDPL) -#include -#include -#endif - -namespace Kokkos { - -namespace Impl { - -template -struct CopyOp; - -template -struct CopyOp { - KOKKOS_INLINE_FUNCTION - static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, - size_t i_src) { - dst(i_dst) = src(i_src); - } -}; - -template -struct CopyOp { - KOKKOS_INLINE_FUNCTION - static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, - size_t i_src) { - for (int j = 0; j < (int)dst.extent(1); j++) dst(i_dst, j) = src(i_src, j); - } -}; - -template -struct CopyOp { - KOKKOS_INLINE_FUNCTION - static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, - size_t i_src) { - for (int j = 0; j < dst.extent(1); j++) - for (int k = 0; k < dst.extent(2); k++) - dst(i_dst, j, k) = src(i_src, j, k); - } -}; -} // namespace Impl - -//---------------------------------------------------------------------------- - -template -class BinSort { - public: - template - struct copy_functor { - using src_view_type = typename SrcViewType::const_type; - - using copy_op = Impl::CopyOp; - - DstViewType dst_values; - src_view_type src_values; - int dst_offset; - - copy_functor(DstViewType const& dst_values_, int const& dst_offset_, - SrcViewType const& src_values_) - : dst_values(dst_values_), - src_values(src_values_), - dst_offset(dst_offset_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int& i) const { - copy_op::copy(dst_values, i + dst_offset, src_values, i); - } - }; - - template - struct copy_permute_functor { - // If a Kokkos::View then can generate constant random access - // otherwise can only use the constant type. - - using src_view_type = std::conditional_t< - Kokkos::is_view::value, - Kokkos::View -#endif - >, - typename SrcViewType::const_type>; - - using perm_view_type = typename PermuteViewType::const_type; - - using copy_op = Impl::CopyOp; - - DstViewType dst_values; - perm_view_type sort_order; - src_view_type src_values; - int src_offset; - - copy_permute_functor(DstViewType const& dst_values_, - PermuteViewType const& sort_order_, - SrcViewType const& src_values_, int const& src_offset_) - : dst_values(dst_values_), - sort_order(sort_order_), - src_values(src_values_), - src_offset(src_offset_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int& i) const { - copy_op::copy(dst_values, i, src_values, src_offset + sort_order(i)); - } - }; - - // Naming this alias "execution_space" would be problematic since it would be - // considered as execution space for the various functors which might use - // another execution space through sort() or create_permute_vector(). - using exec_space = typename Space::execution_space; - using bin_op_type = BinSortOp; - - struct bin_count_tag {}; - struct bin_offset_tag {}; - struct bin_binning_tag {}; - struct bin_sort_bins_tag {}; - - public: - using size_type = SizeType; - using value_type = size_type; - - using offset_type = Kokkos::View; - using bin_count_type = Kokkos::View; - - using const_key_view_type = typename KeyViewType::const_type; - - // If a Kokkos::View then can generate constant random access - // otherwise can only use the constant type. - - using const_rnd_key_view_type = std::conditional_t< - Kokkos::is_view::value, - Kokkos::View >, - const_key_view_type>; - - using non_const_key_scalar = typename KeyViewType::non_const_value_type; - using const_key_scalar = typename KeyViewType::const_value_type; - - using bin_count_atomic_type = - Kokkos::View >; - - private: - const_key_view_type keys; - const_rnd_key_view_type keys_rnd; - - public: - BinSortOp bin_op; - offset_type bin_offsets; - bin_count_atomic_type bin_count_atomic; - bin_count_type bin_count_const; - offset_type sort_order; - - int range_begin; - int range_end; - bool sort_within_bins; - - public: -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinSort() = default; -#else - BinSort() = delete; -#endif - - //---------------------------------------- - // Constructor: takes the keys, the binning_operator and optionally whether to - // sort within bins (default false) - template - BinSort(const ExecutionSpace& exec, const_key_view_type keys_, - int range_begin_, int range_end_, BinSortOp bin_op_, - bool sort_within_bins_ = false) - : keys(keys_), - keys_rnd(keys_), - bin_op(bin_op_), - bin_offsets(), - bin_count_atomic(), - bin_count_const(), - sort_order(), - range_begin(range_begin_), - range_end(range_end_), - sort_within_bins(sort_within_bins_) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "The provided execution space must be able to access the memory space " - "BinSort was initialized with!"); - if (bin_op.max_bins() <= 0) - Kokkos::abort( - "The number of bins in the BinSortOp object must be greater than 0!"); - bin_count_atomic = Kokkos::View( - "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); - bin_count_const = bin_count_atomic; - bin_offsets = - offset_type(view_alloc(exec, WithoutInitializing, - "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), - bin_op.max_bins()); - sort_order = - offset_type(view_alloc(exec, WithoutInitializing, - "Kokkos::SortImpl::BinSortFunctor::sort_order"), - range_end - range_begin); - } - - BinSort(const_key_view_type keys_, int range_begin_, int range_end_, - BinSortOp bin_op_, bool sort_within_bins_ = false) - : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_, - sort_within_bins_) {} - - template - BinSort(const ExecutionSpace& exec, const_key_view_type keys_, - BinSortOp bin_op_, bool sort_within_bins_ = false) - : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} - - BinSort(const_key_view_type keys_, BinSortOp bin_op_, - bool sort_within_bins_ = false) - : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {} - - //---------------------------------------- - // Create the permutation vector, the bin_offset array and the bin_count - // array. Can be called again if keys changed - template - void create_permute_vector(const ExecutionSpace& exec) { - static_assert( - Kokkos::SpaceAccessibility::accessible, - "The provided execution space must be able to access the memory space " - "BinSort was initialized with!"); - - const size_t len = range_end - range_begin; - Kokkos::parallel_for( - "Kokkos::Sort::BinCount", - Kokkos::RangePolicy(exec, 0, len), - *this); - Kokkos::parallel_scan("Kokkos::Sort::BinOffset", - Kokkos::RangePolicy( - exec, 0, bin_op.max_bins()), - *this); - - Kokkos::deep_copy(exec, bin_count_atomic, 0); - Kokkos::parallel_for( - "Kokkos::Sort::BinBinning", - Kokkos::RangePolicy(exec, 0, len), - *this); - - if (sort_within_bins) - Kokkos::parallel_for( - "Kokkos::Sort::BinSort", - Kokkos::RangePolicy( - exec, 0, bin_op.max_bins()), - *this); - } - - // Create the permutation vector, the bin_offset array and the bin_count - // array. Can be called again if keys changed - void create_permute_vector() { - Kokkos::fence("Kokkos::Binsort::create_permute_vector: before"); - exec_space e{}; - create_permute_vector(e); - e.fence("Kokkos::Binsort::create_permute_vector: after"); - } - - // Sort a subset of a view with respect to the first dimension using the - // permutation array - template - void sort(const ExecutionSpace& exec, ValuesViewType const& values, - int values_range_begin, int values_range_end) const { - if (values.extent(0) == 0) { - return; - } - - static_assert( - Kokkos::SpaceAccessibility::accessible, - "The provided execution space must be able to access the memory space " - "BinSort was initialized with!"); - static_assert( - Kokkos::SpaceAccessibility< - ExecutionSpace, typename ValuesViewType::memory_space>::accessible, - "The provided execution space must be able to access the memory space " - "of the View argument!"); - - const size_t len = range_end - range_begin; - const size_t values_len = values_range_end - values_range_begin; - if (len != values_len) { - Kokkos::abort( - "BinSort::sort: values range length != permutation vector length"); - } - - using scratch_view_type = - Kokkos::View; - scratch_view_type sorted_values( - view_alloc(exec, WithoutInitializing, - "Kokkos::SortImpl::BinSortFunctor::sorted_values"), - values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 1 ? values.extent(1) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 2 ? values.extent(2) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 3 ? values.extent(3) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 4 ? values.extent(4) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 5 ? values.extent(5) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 6 ? values.extent(6) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG, - values.rank_dynamic > 7 ? values.extent(7) - : KOKKOS_IMPL_CTOR_DEFAULT_ARG); - - { - copy_permute_functor - functor(sorted_values, sort_order, values, - values_range_begin - range_begin); - - parallel_for("Kokkos::Sort::CopyPermute", - Kokkos::RangePolicy(exec, 0, len), functor); - } - - { - copy_functor functor( - values, range_begin, sorted_values); - - parallel_for("Kokkos::Sort::Copy", - Kokkos::RangePolicy(exec, 0, len), functor); - } - } - - // Sort a subset of a view with respect to the first dimension using the - // permutation array - template - void sort(ValuesViewType const& values, int values_range_begin, - int values_range_end) const { - Kokkos::fence("Kokkos::Binsort::sort: before"); - exec_space exec; - sort(exec, values, values_range_begin, values_range_end); - exec.fence("Kokkos::BinSort:sort: after"); - } - - template - void sort(ExecutionSpace const& exec, ValuesViewType const& values) const { - this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin); - } - - template - void sort(ValuesViewType const& values) const { - this->sort(values, 0, /*values.extent(0)*/ range_end - range_begin); - } - - // Get the permutation vector - KOKKOS_INLINE_FUNCTION - offset_type get_permute_vector() const { return sort_order; } - - // Get the start offsets for each bin - KOKKOS_INLINE_FUNCTION - offset_type get_bin_offsets() const { return bin_offsets; } - - // Get the count for each bin - KOKKOS_INLINE_FUNCTION - bin_count_type get_bin_count() const { return bin_count_const; } - - public: - KOKKOS_INLINE_FUNCTION - void operator()(const bin_count_tag& /*tag*/, const int i) const { - const int j = range_begin + i; - bin_count_atomic(bin_op.bin(keys, j))++; - } - - KOKKOS_INLINE_FUNCTION - void operator()(const bin_offset_tag& /*tag*/, const int i, - value_type& offset, const bool& final) const { - if (final) { - bin_offsets(i) = offset; - } - offset += bin_count_const(i); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const bin_binning_tag& /*tag*/, const int i) const { - const int j = range_begin + i; - const int bin = bin_op.bin(keys, j); - const int count = bin_count_atomic(bin)++; - - sort_order(bin_offsets(bin) + count) = j; - } - - KOKKOS_INLINE_FUNCTION - void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const { - auto bin_size = bin_count_const(i); - if (bin_size <= 1) return; - constexpr bool use_std_sort = - std::is_same_v; - int lower_bound = bin_offsets(i); - int upper_bound = lower_bound + bin_size; - // Switching to std::sort for more than 10 elements has been found - // reasonable experimentally. - if (use_std_sort && bin_size > 10) { - if constexpr (use_std_sort) { - std::sort(&sort_order(lower_bound), &sort_order(upper_bound), - [this](int p, int q) { return bin_op(keys_rnd, p, q); }); - } - } else { - for (int k = lower_bound + 1; k < upper_bound; ++k) { - int old_idx = sort_order(k); - int j = k - 1; - while (j >= lower_bound) { - int new_idx = sort_order(j); - if (!bin_op(keys_rnd, old_idx, new_idx)) break; - sort_order(j + 1) = new_idx; - --j; - } - sort_order(j + 1) = old_idx; - } - } - } -}; - -//---------------------------------------------------------------------------- - -template -struct BinOp1D { - int max_bins_ = {}; - double mul_ = {}; - double min_ = {}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinOp1D() = default; -#else - BinOp1D() = delete; -#endif - - // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, - typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), - // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / - (static_cast(max) - static_cast(min))), - min_(static_cast(min)) { - // For integral types the number of bins may be larger than the range - // in which case we can exactly have one unique value per bin - // and then don't need to sort bins. - if (std::is_integral::value && - (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { - mul_ = 1.; - } - } - - // Determine bin index from key value - template - KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { - return static_cast(mul_ * (static_cast(keys(i)) - min_)); - } - - // Return maximum bin index + 1 - KOKKOS_INLINE_FUNCTION - int max_bins() const { return max_bins_; } - - // Compare to keys within a bin if true new_val will be put before old_val - template - KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, - iType2& i2) const { - return keys(i1) < keys(i2); - } -}; - -template -struct BinOp3D { - int max_bins_[3] = {}; - double mul_[3] = {}; - double min_[3] = {}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinOp3D() = default; -#else - BinOp3D() = delete; -#endif - - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], - typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / - (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / - (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / - (static_cast(max[2]) - static_cast(min[2])); - min_[0] = static_cast(min[0]); - min_[1] = static_cast(min[1]); - min_[2] = static_cast(min[2]); - } - - template - KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { - return int((((int(mul_[0] * (keys(i, 0) - min_[0])) * max_bins_[1]) + - int(mul_[1] * (keys(i, 1) - min_[1]))) * - max_bins_[2]) + - int(mul_[2] * (keys(i, 2) - min_[2]))); - } - - KOKKOS_INLINE_FUNCTION - int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; } - - template - KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, - iType2& i2) const { - if (keys(i1, 0) > keys(i2, 0)) - return true; - else if (keys(i1, 0) == keys(i2, 0)) { - if (keys(i1, 1) > keys(i2, 1)) - return true; - else if (keys(i1, 1) == keys(i2, 1)) { - if (keys(i1, 2) > keys(i2, 2)) return true; - } - } - return false; - } -}; - -namespace Impl { - -template -struct min_max_functor { - using minmax_scalar = - Kokkos::MinMaxScalar; - - ViewType view; - min_max_functor(const ViewType& view_) : view(view_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t& i, minmax_scalar& minmax) const { - if (view(i) < minmax.min_val) minmax.min_val = view(i); - if (view(i) > minmax.max_val) minmax.max_val = view(i); - } -}; - -} // namespace Impl - -template -std::enable_if_t<(Kokkos::is_execution_space::value) && - (!SpaceAccessibility< - HostSpace, typename Kokkos::View:: - memory_space>::accessible)> -sort(const ExecutionSpace& exec, - const Kokkos::View& view) { - if (view.extent(0) == 0) { - return; - } - - using ViewType = Kokkos::View; - using CompType = BinOp1D; - - Kokkos::MinMaxScalar result; - Kokkos::MinMax reducer(result); - parallel_reduce("Kokkos::Sort::FindExtent", - Kokkos::RangePolicy( - exec, 0, view.extent(0)), - Impl::min_max_functor(view), reducer); - if (result.min_val == result.max_val) return; - // For integral types the number of bins may be larger than the range - // in which case we can exactly have one unique value per bin - // and then don't need to sort bins. - bool sort_in_bins = true; - // TODO: figure out better max_bins then this ... - int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { - // Cast to double to avoid possible overflow when using integer - auto const max_val = static_cast(result.max_val); - auto const min_val = static_cast(result.min_val); - // using 10M as the cutoff for special behavior (roughly 40MB for the count - // array) - if ((max_val - min_val) < 10000000) { - max_bins = max_val - min_val + 1; - sort_in_bins = false; - } - } - if (std::is_floating_point::value) { - KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - - static_cast(result.min_val))); - } - - BinSort bin_sort( - view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins); - bin_sort.create_permute_vector(exec); - bin_sort.sort(exec, view); -} - -#if defined(KOKKOS_ENABLE_ONEDPL) -template -void sort(const Experimental::SYCL& space, - const Kokkos::View& view) { - if (view.extent(0) == 0) { - return; - } - - using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, - "SYCL execution space is not able to access the memory space " - "of the View argument!"); - - auto queue = space.sycl_queue(); - auto policy = oneapi::dpl::execution::make_device_policy(queue); - - // Can't use Experimental::begin/end here since the oneDPL then assumes that - // the data is on the host. - static_assert( - ViewType::rank == 1 && - (std::is_same::value || - std::is_same::value), - "SYCL sort only supports contiguous 1D Views."); - const int n = view.extent(0); - oneapi::dpl::sort(policy, view.data(), view.data() + n); -} -#endif - -template -std::enable_if_t<(Kokkos::is_execution_space::value) && - (SpaceAccessibility< - HostSpace, typename Kokkos::View:: - memory_space>::accessible)> -sort(const ExecutionSpace&, const Kokkos::View& view) { - if (view.extent(0) == 0) { - return; - } - auto first = Experimental::begin(view); - auto last = Experimental::end(view); - std::sort(first, last); -} - -#if defined(KOKKOS_ENABLE_CUDA) -template -void sort(const Cuda& space, - const Kokkos::View& view) { - if (view.extent(0) == 0) { - return; - } - const auto exec = thrust::cuda::par.on(space.cuda_stream()); - auto first = Experimental::begin(view); - auto last = Experimental::end(view); - thrust::sort(exec, first, last); -} -#endif - -template -void sort(ViewType const& view) { - Kokkos::fence("Kokkos::sort: before"); - - if (view.extent(0) == 0) { - return; - } - - typename ViewType::execution_space exec; - sort(exec, view); - exec.fence("Kokkos::sort: fence after sorting"); -} - -template -std::enable_if_t::value> sort( - const ExecutionSpace& exec, ViewType view, size_t const begin, - size_t const end) { - if (view.extent(0) == 0) { - return; - } - - using range_policy = Kokkos::RangePolicy; - using CompType = BinOp1D; - - Kokkos::MinMaxScalar result; - Kokkos::MinMax reducer(result); - - parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end), - Impl::min_max_functor(view), reducer); - - if (result.min_val == result.max_val) return; - - BinSort bin_sort( - exec, view, begin, end, - CompType((end - begin) / 2, result.min_val, result.max_val), true); - - bin_sort.create_permute_vector(exec); - bin_sort.sort(exec, view, begin, end); -} - -template -void sort(ViewType view, size_t const begin, size_t const end) { - Kokkos::fence("Kokkos::sort: before"); - - if (view.extent(0) == 0) { - return; - } - - typename ViewType::execution_space exec; - sort(exec, view, begin, end); - exec.fence("Kokkos::Sort: fence after sorting"); -} - -} // namespace Kokkos +#include "sorting/Kokkos_BinSortPublicAPI.hpp" +#include "sorting/Kokkos_SortPublicAPI.hpp" +#include "sorting/Kokkos_SortByKeyPublicAPI.hpp" +#include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/algorithms/src/Kokkos_StdAlgorithms.hpp b/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10b..b532a774e1 100644 --- a/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp new file mode 100644 index 0000000000..73e751f572 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -0,0 +1,129 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_BIN_OPS_PUBLIC_API_HPP_ +#define KOKKOS_BIN_OPS_PUBLIC_API_HPP_ + +#include +#include + +namespace Kokkos { + +template +struct BinOp1D { + int max_bins_ = {}; + double mul_ = {}; + double min_ = {}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED BinOp1D() = default; +#else + BinOp1D() = delete; +#endif + + // Construct BinOp with number of bins, minimum value and maximum value + BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + typename KeyViewType::const_value_type max) + : max_bins_(max_bins__ + 1), + // Cast to double to avoid possible overflow when using integer + mul_(static_cast(max_bins__) / + (static_cast(max) - static_cast(min))), + min_(static_cast(min)) { + // For integral types the number of bins may be larger than the range + // in which case we can exactly have one unique value per bin + // and then don't need to sort bins. + if (std::is_integral::value && + (static_cast(max) - static_cast(min)) <= + static_cast(max_bins__)) { + mul_ = 1.; + } + } + + // Determine bin index from key value + template + KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { + return static_cast(mul_ * (static_cast(keys(i)) - min_)); + } + + // Return maximum bin index + 1 + KOKKOS_INLINE_FUNCTION + int max_bins() const { return max_bins_; } + + // Compare to keys within a bin if true new_val will be put before old_val + template + KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, + iType2& i2) const { + return keys(i1) < keys(i2); + } +}; + +template +struct BinOp3D { + int max_bins_[3] = {}; + double mul_[3] = {}; + double min_[3] = {}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED BinOp3D() = default; +#else + BinOp3D() = delete; +#endif + + BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + typename KeyViewType::const_value_type max[]) { + max_bins_[0] = max_bins__[0]; + max_bins_[1] = max_bins__[1]; + max_bins_[2] = max_bins__[2]; + mul_[0] = static_cast(max_bins__[0]) / + (static_cast(max[0]) - static_cast(min[0])); + mul_[1] = static_cast(max_bins__[1]) / + (static_cast(max[1]) - static_cast(min[1])); + mul_[2] = static_cast(max_bins__[2]) / + (static_cast(max[2]) - static_cast(min[2])); + min_[0] = static_cast(min[0]); + min_[1] = static_cast(min[1]); + min_[2] = static_cast(min[2]); + } + + template + KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { + return int((((int(mul_[0] * (keys(i, 0) - min_[0])) * max_bins_[1]) + + int(mul_[1] * (keys(i, 1) - min_[1]))) * + max_bins_[2]) + + int(mul_[2] * (keys(i, 2) - min_[2]))); + } + + KOKKOS_INLINE_FUNCTION + int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; } + + template + KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, + iType2& i2) const { + if (keys(i1, 0) > keys(i2, 0)) + return true; + else if (keys(i1, 0) == keys(i2, 0)) { + if (keys(i1, 1) > keys(i2, 1)) + return true; + else if (keys(i1, 1) == keys(i2, 1)) { + if (keys(i1, 2) > keys(i2, 2)) return true; + } + } + return false; + } +}; + +} // namespace Kokkos +#endif diff --git a/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp new file mode 100644 index 0000000000..c399279fe4 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -0,0 +1,410 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_BIN_SORT_PUBLIC_API_HPP_ +#define KOKKOS_BIN_SORT_PUBLIC_API_HPP_ + +#include "Kokkos_BinOpsPublicAPI.hpp" +#include "impl/Kokkos_CopyOpsForBinSortImpl.hpp" +#include +#include + +namespace Kokkos { + +template +class BinSort { + public: + template + struct copy_functor { + using src_view_type = typename SrcViewType::const_type; + + using copy_op = Impl::CopyOp; + + DstViewType dst_values; + src_view_type src_values; + int dst_offset; + + copy_functor(DstViewType const& dst_values_, int const& dst_offset_, + SrcViewType const& src_values_) + : dst_values(dst_values_), + src_values(src_values_), + dst_offset(dst_offset_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + copy_op::copy(dst_values, i + dst_offset, src_values, i); + } + }; + + template + struct copy_permute_functor { + // If a Kokkos::View then can generate constant random access + // otherwise can only use the constant type. + + using src_view_type = std::conditional_t< + Kokkos::is_view::value, + Kokkos::View= 230700) + , + Kokkos::MemoryTraits +#endif + >, + typename SrcViewType::const_type>; + + using perm_view_type = typename PermuteViewType::const_type; + + using copy_op = Impl::CopyOp; + + DstViewType dst_values; + perm_view_type sort_order; + src_view_type src_values; + int src_offset; + + copy_permute_functor(DstViewType const& dst_values_, + PermuteViewType const& sort_order_, + SrcViewType const& src_values_, int const& src_offset_) + : dst_values(dst_values_), + sort_order(sort_order_), + src_values(src_values_), + src_offset(src_offset_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + copy_op::copy(dst_values, i, src_values, src_offset + sort_order(i)); + } + }; + + // Naming this alias "execution_space" would be problematic since it would be + // considered as execution space for the various functors which might use + // another execution space through sort() or create_permute_vector(). + using exec_space = typename Space::execution_space; + using bin_op_type = BinSortOp; + + struct bin_count_tag {}; + struct bin_offset_tag {}; + struct bin_binning_tag {}; + struct bin_sort_bins_tag {}; + + public: + using size_type = SizeType; + using value_type = size_type; + + using offset_type = Kokkos::View; + using bin_count_type = Kokkos::View; + + using const_key_view_type = typename KeyViewType::const_type; + + // If a Kokkos::View then can generate constant random access + // otherwise can only use the constant type. + + using const_rnd_key_view_type = std::conditional_t< + Kokkos::is_view::value, + Kokkos::View >, + const_key_view_type>; + + using non_const_key_scalar = typename KeyViewType::non_const_value_type; + using const_key_scalar = typename KeyViewType::const_value_type; + + using bin_count_atomic_type = + Kokkos::View >; + + private: + const_key_view_type keys; + const_rnd_key_view_type keys_rnd; + + public: + BinSortOp bin_op; + offset_type bin_offsets; + bin_count_atomic_type bin_count_atomic; + bin_count_type bin_count_const; + offset_type sort_order; + + int range_begin; + int range_end; + bool sort_within_bins; + + public: +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED BinSort() = default; +#else + BinSort() = delete; +#endif + + //---------------------------------------- + // Constructor: takes the keys, the binning_operator and optionally whether to + // sort within bins (default false) + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + int range_begin_, int range_end_, BinSortOp bin_op_, + bool sort_within_bins_ = false) + : keys(keys_), + keys_rnd(keys_), + bin_op(bin_op_), + bin_offsets(), + bin_count_atomic(), + bin_count_const(), + sort_order(), + range_begin(range_begin_), + range_end(range_end_), + sort_within_bins(sort_within_bins_) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "The provided execution space must be able to access the memory space " + "BinSort was initialized with!"); + if (bin_op.max_bins() <= 0) + Kokkos::abort( + "The number of bins in the BinSortOp object must be greater than 0!"); + bin_count_atomic = Kokkos::View( + "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); + bin_count_const = bin_count_atomic; + bin_offsets = + offset_type(view_alloc(exec, WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), + bin_op.max_bins()); + sort_order = + offset_type(view_alloc(exec, WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sort_order"), + range_end - range_begin); + } + + BinSort(const_key_view_type keys_, int range_begin_, int range_end_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(exec_space{}, keys_, range_begin_, range_end_, bin_op_, + sort_within_bins_) {} + + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} + + BinSort(const_key_view_type keys_, BinSortOp bin_op_, + bool sort_within_bins_ = false) + : BinSort(exec_space{}, keys_, bin_op_, sort_within_bins_) {} + + //---------------------------------------- + // Create the permutation vector, the bin_offset array and the bin_count + // array. Can be called again if keys changed + template + void create_permute_vector(const ExecutionSpace& exec) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "The provided execution space must be able to access the memory space " + "BinSort was initialized with!"); + + const size_t len = range_end - range_begin; + Kokkos::parallel_for( + "Kokkos::Sort::BinCount", + Kokkos::RangePolicy(exec, 0, len), + *this); + Kokkos::parallel_scan("Kokkos::Sort::BinOffset", + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), + *this); + + Kokkos::deep_copy(exec, bin_count_atomic, 0); + Kokkos::parallel_for( + "Kokkos::Sort::BinBinning", + Kokkos::RangePolicy(exec, 0, len), + *this); + + if (sort_within_bins) + Kokkos::parallel_for( + "Kokkos::Sort::BinSort", + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), + *this); + } + + // Create the permutation vector, the bin_offset array and the bin_count + // array. Can be called again if keys changed + void create_permute_vector() { + Kokkos::fence("Kokkos::Binsort::create_permute_vector: before"); + exec_space e{}; + create_permute_vector(e); + e.fence("Kokkos::Binsort::create_permute_vector: after"); + } + + // Sort a subset of a view with respect to the first dimension using the + // permutation array + template + void sort(const ExecutionSpace& exec, ValuesViewType const& values, + int values_range_begin, int values_range_end) const { + if (values.extent(0) == 0) { + return; + } + + static_assert( + Kokkos::SpaceAccessibility::accessible, + "The provided execution space must be able to access the memory space " + "BinSort was initialized with!"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename ValuesViewType::memory_space>::accessible, + "The provided execution space must be able to access the memory space " + "of the View argument!"); + + const size_t len = range_end - range_begin; + const size_t values_len = values_range_end - values_range_begin; + if (len != values_len) { + Kokkos::abort( + "BinSort::sort: values range length != permutation vector length"); + } + + using scratch_view_type = + Kokkos::View; + scratch_view_type sorted_values( + view_alloc(exec, WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sorted_values"), + values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 1 ? values.extent(1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 2 ? values.extent(2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 3 ? values.extent(3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 4 ? values.extent(4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 5 ? values.extent(5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 6 ? values.extent(6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 7 ? values.extent(7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG); + + { + copy_permute_functor + functor(sorted_values, sort_order, values, + values_range_begin - range_begin); + + parallel_for("Kokkos::Sort::CopyPermute", + Kokkos::RangePolicy(exec, 0, len), functor); + } + + { + copy_functor functor( + values, range_begin, sorted_values); + + parallel_for("Kokkos::Sort::Copy", + Kokkos::RangePolicy(exec, 0, len), functor); + } + } + + // Sort a subset of a view with respect to the first dimension using the + // permutation array + template + void sort(ValuesViewType const& values, int values_range_begin, + int values_range_end) const { + Kokkos::fence("Kokkos::Binsort::sort: before"); + exec_space exec; + sort(exec, values, values_range_begin, values_range_end); + exec.fence("Kokkos::BinSort:sort: after"); + } + + template + void sort(ExecutionSpace const& exec, ValuesViewType const& values) const { + this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin); + } + + template + void sort(ValuesViewType const& values) const { + this->sort(values, 0, /*values.extent(0)*/ range_end - range_begin); + } + + // Get the permutation vector + KOKKOS_INLINE_FUNCTION + offset_type get_permute_vector() const { return sort_order; } + + // Get the start offsets for each bin + KOKKOS_INLINE_FUNCTION + offset_type get_bin_offsets() const { return bin_offsets; } + + // Get the count for each bin + KOKKOS_INLINE_FUNCTION + bin_count_type get_bin_count() const { return bin_count_const; } + + public: + KOKKOS_INLINE_FUNCTION + void operator()(const bin_count_tag& /*tag*/, const int i) const { + const int j = range_begin + i; + bin_count_atomic(bin_op.bin(keys, j))++; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_offset_tag& /*tag*/, const int i, + value_type& offset, const bool& final) const { + if (final) { + bin_offsets(i) = offset; + } + offset += bin_count_const(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_binning_tag& /*tag*/, const int i) const { + const int j = range_begin + i; + const int bin = bin_op.bin(keys, j); + const int count = bin_count_atomic(bin)++; + + sort_order(bin_offsets(bin) + count) = j; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const { + auto bin_size = bin_count_const(i); + if (bin_size <= 1) return; + constexpr bool use_std_sort = + std::is_same_v; + int lower_bound = bin_offsets(i); + int upper_bound = lower_bound + bin_size; + // Switching to std::sort for more than 10 elements has been found + // reasonable experimentally. + if (use_std_sort && bin_size > 10) { + KOKKOS_IF_ON_HOST( + (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) + } else { + for (int k = lower_bound + 1; k < upper_bound; ++k) { + int old_idx = sort_order(k); + int j = k - 1; + while (j >= lower_bound) { + int new_idx = sort_order(j); + if (!bin_op(keys_rnd, old_idx, new_idx)) break; + sort_order(j + 1) = new_idx; + --j; + } + sort_order(j + 1) = old_idx; + } + } + } +}; + +} // namespace Kokkos +#endif diff --git a/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp new file mode 100644 index 0000000000..dd468e0734 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_NestedSortPublicAPI.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_NESTED_SORT_PUBLIC_API_HPP_ +#define KOKKOS_NESTED_SORT_PUBLIC_API_HPP_ + +#include "impl/Kokkos_NestedSortImpl.hpp" +#include +#include + +namespace Kokkos { +namespace Experimental { + +template +KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, + const ViewType& view) { + Impl::sort_nested_impl(t, view, nullptr, + Experimental::Impl::StdAlgoLessThanBinaryPredicate< + typename ViewType::non_const_value_type>(), + Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_team(const TeamMember& t, const ViewType& view, + const Comparator& comp) { + Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t, + const KeyViewType& keyView, + const ValueViewType& valueView) { + Impl::sort_nested_impl(t, keyView, valueView, + Experimental::Impl::StdAlgoLessThanBinaryPredicate< + typename KeyViewType::non_const_value_type>(), + Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_by_key_team(const TeamMember& t, + const KeyViewType& keyView, + const ValueViewType& valueView, + const Comparator& comp) { + Impl::sort_nested_impl(t, keyView, valueView, comp, + Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t, + const ViewType& view) { + Impl::sort_nested_impl(t, view, nullptr, + Experimental::Impl::StdAlgoLessThanBinaryPredicate< + typename ViewType::non_const_value_type>(), + Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_thread(const TeamMember& t, + const ViewType& view, + const Comparator& comp) { + Impl::sort_nested_impl(t, view, nullptr, comp, Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t, + const KeyViewType& keyView, + const ValueViewType& valueView) { + Impl::sort_nested_impl(t, keyView, valueView, + Experimental::Impl::StdAlgoLessThanBinaryPredicate< + typename KeyViewType::non_const_value_type>(), + Impl::NestedRange()); +} + +template +KOKKOS_INLINE_FUNCTION void sort_by_key_thread(const TeamMember& t, + const KeyViewType& keyView, + const ValueViewType& valueView, + const Comparator& comp) { + Impl::sort_nested_impl(t, keyView, valueView, comp, + Impl::NestedRange()); +} + +} // namespace Experimental +} // namespace Kokkos +#endif diff --git a/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 0000000000..fc73eccad6 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp new file mode 100644 index 0000000000..308e9e3a00 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -0,0 +1,196 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_PUBLIC_API_HPP_ +#define KOKKOS_SORT_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortImpl.hpp" +#include +#include +#include + +namespace Kokkos { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort(const ExecutionSpace& exec, + const Kokkos::View& view) { + // constraints + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; + static_assert( + ViewType::rank == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "Kokkos::sort without comparator: supports 1D Views with LayoutRight, " + "LayoutLeft or LayoutStride."); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the " + "View argument!"); + + if (view.extent(0) <= 1) { + return; + } + + if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } else { + Impl::sort_device_view_without_comparator(exec, view); + } +} + +template +void sort(const Kokkos::View& view) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + Kokkos::fence("Kokkos::sort: before"); + + if (view.extent(0) <= 1) { + return; + } + + typename ViewType::execution_space exec; + sort(exec, view); + exec.fence("Kokkos::sort: fence after sorting"); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- +template +void sort(const ExecutionSpace& exec, + const Kokkos::View& view, + const ComparatorType& comparator) { + // constraints + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; + static_assert( + ViewType::rank == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "Kokkos::sort with comparator: supports 1D Views with LayoutRight, " + "LayoutLeft or LayoutStride."); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the View argument!"); + + if (view.extent(0) <= 1) { + return; + } + + if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } else { + Impl::sort_device_view_with_comparator(exec, view, comparator); + } +} + +template +void sort(const Kokkos::View& view, + const ComparatorType& comparator) { + using ViewType = Kokkos::View; + static_assert( + ViewType::rank == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "Kokkos::sort with comparator: supports 1D Views with LayoutRight, " + "LayoutLeft or LayoutStride."); + + Kokkos::fence("Kokkos::sort with comparator: before"); + + if (view.extent(0) <= 1) { + return; + } + + typename ViewType::execution_space exec; + sort(exec, view, comparator); + exec.fence("Kokkos::sort with comparator: fence after sorting"); +} + +// --------------------------------------------------------------- +// overloads for sorting a view with a subrange +// specified via integers begin, end +// --------------------------------------------------------------- + +template +std::enable_if_t::value> sort( + const ExecutionSpace& exec, ViewType view, size_t const begin, + size_t const end) { + // view must be rank-1 because the Impl::min_max_functor + // used below only works for rank-1 views for now + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + + using range_policy = Kokkos::RangePolicy; + using CompType = BinOp1D; + + Kokkos::MinMaxScalar result; + Kokkos::MinMax reducer(result); + + parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end), + Impl::min_max_functor(view), reducer); + + if (result.min_val == result.max_val) return; + + BinSort bin_sort( + exec, view, begin, end, + CompType((end - begin) / 2, result.min_val, result.max_val), true); + + bin_sort.create_permute_vector(exec); + bin_sort.sort(exec, view, begin, end); +} + +template +void sort(ViewType view, size_t const begin, size_t const end) { + // same constraints as the overload above which this gets dispatched to + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + Kokkos::fence("Kokkos::sort: before"); + + if (view.extent(0) <= 1) { + return; + } + + typename ViewType::execution_space exec; + sort(exec, view, begin, end); + exec.fence("Kokkos::Sort: fence after sorting"); +} + +} // namespace Kokkos +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp new file mode 100644 index 0000000000..07f5926d82 --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_CopyOpsForBinSortImpl.hpp @@ -0,0 +1,61 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_COPY_OPS_FOR_BINSORT_IMPL_HPP_ +#define KOKKOS_COPY_OPS_FOR_BINSORT_IMPL_HPP_ + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +struct CopyOp; + +template +struct CopyOp { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + dst(i_dst) = src(i_src); + } +}; + +template +struct CopyOp { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + for (int j = 0; j < (int)dst.extent(1); j++) dst(i_dst, j) = src(i_src, j); + } +}; + +template +struct CopyOp { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + for (int j = 0; j < dst.extent(1); j++) + for (int k = 0; k < dst.extent(2); k++) + dst(i_dst, j, k) = src(i_src, j, k); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp new file mode 100644 index 0000000000..2fe58272d9 --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ -0,0 +1,114 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_NESTED_SORT_IMPL_HPP_ +#define KOKKOS_NESTED_SORT_IMPL_HPP_ + +#include + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +// true for TeamVectorRange, false for ThreadVectorRange +template +struct NestedRange {}; + +// Specialization for team-level +template <> +struct NestedRange { + template + KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) { + return Kokkos::TeamVectorRange(t, len); + } + template + KOKKOS_FUNCTION static void barrier(const TeamMember& t) { + t.team_barrier(); + } +}; + +// Specialization for thread-level +template <> +struct NestedRange { + template + KOKKOS_FUNCTION static auto create(const TeamMember& t, SizeType len) { + return Kokkos::ThreadVectorRange(t, len); + } + // Barrier is no-op, as vector lanes of a thread are implicitly synchronized + // after parallel region + template + KOKKOS_FUNCTION static void barrier(const TeamMember&) {} +}; + +// When just doing sort (not sort_by_key), use nullptr_t for ValueViewType. +// This only takes the NestedRange instance for template arg deduction. +template +KOKKOS_INLINE_FUNCTION void sort_nested_impl( + const TeamMember& t, const KeyViewType& keyView, + [[maybe_unused]] const ValueViewType& valueView, const Comparator& comp, + const NestedRange) { + using SizeType = typename KeyViewType::size_type; + using KeyType = typename KeyViewType::non_const_value_type; + using Range = NestedRange; + SizeType n = keyView.extent(0); + SizeType npot = 1; + SizeType levels = 0; + // FIXME: ceiling power-of-two is a common thing to need - make it a utility + while (npot < n) { + levels++; + npot <<= 1; + } + for (SizeType i = 0; i < levels; i++) { + for (SizeType j = 0; j <= i; j++) { + // n/2 pairs of items are compared in parallel + Kokkos::parallel_for(Range::create(t, npot / 2), [=](const SizeType k) { + // How big are the brown/pink boxes? + // (Terminology comes from Wikipedia diagram) + // https://commons.wikimedia.org/wiki/File:BitonicSort.svg#/media/File:BitonicSort.svg + SizeType boxSize = SizeType(2) << (i - j); + // Which box contains this thread? + SizeType boxID = k >> (i - j); // k * 2 / boxSize; + SizeType boxStart = boxID << (1 + i - j); // boxID * boxSize + SizeType boxOffset = k - (boxStart >> 1); // k - boxID * boxSize / 2; + SizeType elem1 = boxStart + boxOffset; + // In first phase (j == 0, brown box): within a box, compare with the + // opposite value in the box. + // In later phases (j > 0, pink box): within a box, compare with fixed + // distance (boxSize / 2) apart. + SizeType elem2 = (j == 0) ? (boxStart + boxSize - 1 - boxOffset) + : (elem1 + boxSize / 2); + if (elem2 < n) { + KeyType key1 = keyView(elem1); + KeyType key2 = keyView(elem2); + if (comp(key2, key1)) { + keyView(elem1) = key2; + keyView(elem2) = key1; + if constexpr (!std::is_same_v) { + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); + } + } + } + }); + Range::barrier(t); + } + } +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 0000000000..f11f807048 --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,424 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. +template +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +// FIXME_NVCC: nvcc has trouble compiling lambdas inside a function with +// variadic templates (sort_by_key_via_sort). Switch to using functors instead. +template +struct IotaFunctor { + Permute _permute; + KOKKOS_FUNCTION void operator()(int i) const { _permute(i) = i; } +}; +template +struct LessFunctor { + Keys _keys; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return _keys(i) < _keys(j); + } +}; + +// FIXME_NVCC+MSVC: We can't use a lambda instead of a functor which gave us +// "For this host platform/dialect, an extended lambda cannot be defined inside +// the 'if' or 'else' block of a constexpr if statement" +template +struct KeyComparisonFunctor { + Keys m_keys; + Comparator m_comparator; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return m_comparator(m_keys(i), m_keys(j)); + } +}; + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for("Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + IotaFunctor{permute}); + + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort(host_exec, host_permute, + LessFunctor{host_keys}); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, + KeyComparisonFunctor{ + host_keys, keys_comparator}); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort(exec, permute, LessFunctor{keys}); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, + KeyComparisonFunctor{ + keys, keys_comparator}); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp new file mode 100644 index 0000000000..0894622891 --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -0,0 +1,416 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_FREE_FUNCS_IMPL_HPP_ + +#include "../Kokkos_BinOpsPublicAPI.hpp" +#include "../Kokkos_BinSortPublicAPI.hpp" +#include +#include +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +#include +#include +#endif + +namespace Kokkos { +namespace Impl { + +template +struct better_off_calling_std_sort : std::false_type {}; + +#if defined KOKKOS_ENABLE_SERIAL +template <> +struct better_off_calling_std_sort : std::true_type {}; +#endif + +#if defined KOKKOS_ENABLE_OPENMP +template <> +struct better_off_calling_std_sort : std::true_type {}; +#endif + +#if defined KOKKOS_ENABLE_THREADS +template <> +struct better_off_calling_std_sort : std::true_type {}; +#endif + +#if defined KOKKOS_ENABLE_HPX +template <> +struct better_off_calling_std_sort : std::true_type { +}; +#endif + +template +inline constexpr bool better_off_calling_std_sort_v = + better_off_calling_std_sort::value; + +template +struct min_max_functor { + using minmax_scalar = + Kokkos::MinMaxScalar; + + ViewType view; + min_max_functor(const ViewType& view_) : view(view_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t& i, minmax_scalar& minmax) const { + if (view(i) < minmax.min_val) minmax.min_val = view(i); + if (view(i) > minmax.max_val) minmax.max_val = view(i); + } +}; + +template +void sort_via_binsort(const ExecutionSpace& exec, + const Kokkos::View& view) { + // Although we are using BinSort below, which could work on rank-2 views, + // for now view must be rank-1 because the min_max_functor + // used below only works for rank-1 views + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + + Kokkos::MinMaxScalar result; + Kokkos::MinMax reducer(result); + parallel_reduce("Kokkos::Sort::FindExtent", + Kokkos::RangePolicy( + exec, 0, view.extent(0)), + min_max_functor(view), reducer); + if (result.min_val == result.max_val) return; + // For integral types the number of bins may be larger than the range + // in which case we can exactly have one unique value per bin + // and then don't need to sort bins. + bool sort_in_bins = true; + // TODO: figure out better max_bins then this ... + int64_t max_bins = view.extent(0) / 2; + if (std::is_integral::value) { + // Cast to double to avoid possible overflow when using integer + auto const max_val = static_cast(result.max_val); + auto const min_val = static_cast(result.min_val); + // using 10M as the cutoff for special behavior (roughly 40MB for the count + // array) + if ((max_val - min_val) < 10000000) { + max_bins = max_val - min_val + 1; + sort_in_bins = false; + } + } + if (std::is_floating_point::value) { + KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - + static_cast(result.min_val))); + } + + using CompType = BinOp1D; + BinSort bin_sort( + view, CompType(max_bins, result.min_val, result.max_val), sort_in_bins); + bin_sort.create_permute_vector(exec); + bin_sort.sort(exec, view); +} + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_cudathrust(const Cuda& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::cuda::par.on(space.cuda_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_rocthrust(const HIP& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_onedpl(const Kokkos::Experimental::SYCL& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(SpaceAccessibility::accessible, + "SYCL execution space is not able to access the memory space " + "of the View argument!"); + + static_assert( + (ViewType::rank == 1) && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "SYCL sort only supports contiguous rank-1 Views with LayoutLeft, " + "LayoutRight or LayoutStride" + "For the latter, this means the View must have stride(0) = 1, enforced " + "at runtime."); + + if (view.stride(0) != 1) { + Kokkos::abort("SYCL sort only supports rank-1 Views with stride(0) = 1."); + } + + if (view.extent(0) <= 1) { + return; + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = space.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = view.extent(0); + oneapi::dpl::sort(policy, view.data(), view.data() + n, + std::forward(maybeComparator)...); +} +#endif + +template +void copy_to_host_run_stdsort_copy_back( + const ExecutionSpace& exec, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + namespace KE = ::Kokkos::Experimental; + + using ViewType = Kokkos::View; + using layout = typename ViewType::array_layout; + if constexpr (std::is_same_v) { + // for strided views we cannot just deep_copy from device to host, + // so we need to do a few more jumps + using view_value_type = typename ViewType::non_const_value_type; + using view_exespace = typename ViewType::execution_space; + using view_deep_copyable_t = Kokkos::View; + view_deep_copyable_t view_dc("view_dc", view.extent(0)); + KE::copy(exec, view, view_dc); + + // run sort on the mirror of view_dc + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + Kokkos::deep_copy(exec, view_dc, mv_h); + + // copy back to argument view + KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); + } else { + auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + Kokkos::deep_copy(exec, view, view_h); + } +} + +// -------------------------------------------------- +// +// specialize cases for sorting without comparator +// +// -------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_device_view_without_comparator( + const Cuda& exec, const Kokkos::View& view) { + sort_cudathrust(exec, view); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& view) { + using ViewType = Kokkos::View; + static_assert( + (ViewType::rank == 1) && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "sort_device_view_without_comparator: supports rank-1 Views " + "with LayoutLeft, LayoutRight or LayoutStride"); + + if (view.stride(0) == 1) { + sort_onedpl(exec, view); + } else { + copy_to_host_run_stdsort_copy_back(exec, view); + } +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& view) { + sort_via_binsort(exec, view); +} + +// -------------------------------------------------- +// +// specialize cases for sorting with comparator +// +// -------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_device_view_with_comparator( + const Cuda& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_cudathrust(exec, view, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& view, + const ComparatorType& comparator) { + using ViewType = Kokkos::View; + static_assert( + (ViewType::rank == 1) && + (std::is_same_v || + std::is_same_v || + std::is_same_v), + "sort_device_view_with_comparator: supports rank-1 Views " + "with LayoutLeft, LayoutRight or LayoutStride"); + + if (view.stride(0) == 1) { + sort_onedpl(exec, view, comparator); + } else { + copy_to_host_run_stdsort_copy_back(exec, view, comparator); + } +} +#endif + +template +std::enable_if_t::value> +sort_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& view, + const ComparatorType& comparator) { + // This is a fallback case if a more specialized overload does not exist: + // for now, this fallback copies data to host, runs std::sort + // and then copies data back. Potentially, this can later be changed + // with a better solution like our own quicksort on device or similar. + + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; +// Note with HIP unified memory this code path is still the right thing to do +// if we end up here when RocThrust is not enabled. +// The create_mirror_view_and_copy will do the right thing (no copy). +#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY + static_assert(!SpaceAccessibility::accessible, + "Impl::sort_device_view_with_comparator: should not be called " + "on a view that is already accessible on the host"); +#endif + + copy_to_host_run_stdsort_copy_back(exec, view, comparator); +} + +} // namespace Impl +} // namespace Kokkos +#endif diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp index 38dcd1a674..f254686dba 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentDifference.hpp @@ -23,64 +23,85 @@ namespace Kokkos { namespace Experimental { -template -std::enable_if_t::value, - OutputIteratorType> -adjacent_difference(const ExecutionSpace& ex, InputIteratorType first_from, - InputIteratorType last_from, - OutputIteratorType first_dest) { +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorType, + std::enable_if_t::value && + ::Kokkos::is_execution_space::value, + int> = 0> +OutputIteratorType adjacent_difference(const ExecutionSpace& ex, + InputIteratorType first_from, + InputIteratorType last_from, + OutputIteratorType first_dest) { using value_type1 = typename InputIteratorType::value_type; using value_type2 = typename OutputIteratorType::value_type; using binary_op = Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; - return Impl::adjacent_difference_impl( + return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_iterator_api", ex, first_from, last_from, first_dest, binary_op()); } -template -std::enable_if_t::value, - OutputIteratorType> -adjacent_difference(const ExecutionSpace& ex, InputIteratorType first_from, - InputIteratorType last_from, OutputIteratorType first_dest, - BinaryOp bin_op) { - return Impl::adjacent_difference_impl( +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorType, typename BinaryOp, + std::enable_if_t::value && + ::Kokkos::is_execution_space::value, + int> = 0> +OutputIteratorType adjacent_difference(const ExecutionSpace& ex, + InputIteratorType first_from, + InputIteratorType last_from, + OutputIteratorType first_dest, + BinaryOp bin_op) { + return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_iterator_api", ex, first_from, last_from, first_dest, bin_op); } -template -std::enable_if_t::value, - OutputIteratorType> -adjacent_difference(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first_from, InputIteratorType last_from, - OutputIteratorType first_dest) { +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorType, + std::enable_if_t::value && + ::Kokkos::is_execution_space::value, + int> = 0> +OutputIteratorType adjacent_difference(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first_from, + InputIteratorType last_from, + OutputIteratorType first_dest) { using value_type1 = typename InputIteratorType::value_type; using value_type2 = typename OutputIteratorType::value_type; using binary_op = Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; - return Impl::adjacent_difference_impl(label, ex, first_from, last_from, - first_dest, binary_op()); + return Impl::adjacent_difference_exespace_impl( + label, ex, first_from, last_from, first_dest, binary_op()); } -template -std::enable_if_t::value, - OutputIteratorType> -adjacent_difference(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first_from, InputIteratorType last_from, - OutputIteratorType first_dest, BinaryOp bin_op) { - return Impl::adjacent_difference_impl(label, ex, first_from, last_from, - first_dest, bin_op); +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorType, typename BinaryOp, + std::enable_if_t::value && + ::Kokkos::is_execution_space::value, + int> = 0> +OutputIteratorType adjacent_difference(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first_from, + InputIteratorType last_from, + OutputIteratorType first_dest, + BinaryOp bin_op) { + return Impl::adjacent_difference_exespace_impl(label, ex, first_from, + last_from, first_dest, bin_op); } -template +template ::value, + int> = 0> auto adjacent_difference( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -96,13 +117,15 @@ auto adjacent_difference( using binary_op = Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; - return Impl::adjacent_difference_impl( + return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op()); } -template +template ::value, + int> = 0> auto adjacent_difference( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -111,13 +134,15 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::adjacent_difference_impl( + return Impl::adjacent_difference_exespace_impl( "Kokkos::adjacent_difference_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), bin_op); } -template +template ::value, + int> = 0> auto adjacent_difference( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -134,13 +159,15 @@ auto adjacent_difference( Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; - return Impl::adjacent_difference_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest), binary_op()); + return Impl::adjacent_difference_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op()); } -template +template ::value, + int> = 0> auto adjacent_difference( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -149,9 +176,85 @@ auto adjacent_difference( namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::adjacent_difference_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest), bin_op); + return Impl::adjacent_difference_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), bin_op); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template ::value && + ::Kokkos::is_team_handle::value, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType adjacent_difference( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest) { + using value_type1 = typename InputIteratorType::value_type; + using value_type2 = typename OutputIteratorType::value_type; + using binary_op = + Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; + return Impl::adjacent_difference_team_impl(teamHandle, first_from, last_from, + first_dest, binary_op()); +} + +template ::value && + ::Kokkos::is_team_handle::value, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +adjacent_difference(const TeamHandleType& teamHandle, + InputIteratorType first_from, InputIteratorType last_from, + OutputIteratorType first_dest, BinaryOp bin_op) { + return Impl::adjacent_difference_team_impl(teamHandle, first_from, last_from, + first_dest, bin_op); +} + +template < + typename TeamHandleType, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto adjacent_difference( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest) { + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + + using view_type1 = ::Kokkos::View; + using view_type2 = ::Kokkos::View; + using value_type1 = typename view_type1::value_type; + using value_type2 = typename view_type2::value_type; + using binary_op = + Impl::StdAdjacentDifferenceDefaultBinaryOpFunctor; + return Impl::adjacent_difference_team_impl(teamHandle, KE::cbegin(view_from), + KE::cend(view_from), + KE::begin(view_dest), binary_op()); +} + +template < + typename TeamHandleType, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto adjacent_difference( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOp bin_op) { + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + return Impl::adjacent_difference_team_impl(teamHandle, KE::cbegin(view_from), + KE::cend(view_from), + KE::begin(view_dest), bin_op); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp b/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp index 43c2b66010..ac476ca5bf 100644 --- a/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AdjacentFind.hpp @@ -23,71 +23,144 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set1 -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t, int> = 0> IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default", - ex, first, last); + return Impl::adjacent_find_exespace_impl( + "Kokkos::adjacent_find_iterator_api_default", ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t, int> = 0> IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::adjacent_find_impl(label, ex, first, last); + return Impl::adjacent_find_exespace_impl(label, ex, first, last); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t, int> = 0> auto adjacent_find(const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex, - KE::begin(v), KE::end(v)); + return Impl::adjacent_find_exespace_impl( + "Kokkos::adjacent_find_view_api_default", ex, KE::begin(v), KE::end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t, int> = 0> auto adjacent_find(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v)); + return Impl::adjacent_find_exespace_impl(label, ex, KE::begin(v), KE::end(v)); } // overload set2 -template +template < + typename ExecutionSpace, typename IteratorType, + typename BinaryPredicateType, + std::enable_if_t, int> = 0> IteratorType adjacent_find(const ExecutionSpace& ex, IteratorType first, IteratorType last, BinaryPredicateType pred) { - return Impl::adjacent_find_impl("Kokkos::adjacent_find_iterator_api_default", - ex, first, last, pred); + return Impl::adjacent_find_exespace_impl( + "Kokkos::adjacent_find_iterator_api_default", ex, first, last, pred); } -template +template < + typename ExecutionSpace, typename IteratorType, + typename BinaryPredicateType, + std::enable_if_t, int> = 0> IteratorType adjacent_find(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, BinaryPredicateType pred) { - return Impl::adjacent_find_impl(label, ex, first, last, pred); + return Impl::adjacent_find_exespace_impl(label, ex, first, last, pred); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename BinaryPredicateType, + std::enable_if_t, int> = 0> auto adjacent_find(const ExecutionSpace& ex, const ::Kokkos::View& v, BinaryPredicateType pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::adjacent_find_impl("Kokkos::adjacent_find_view_api_default", ex, - KE::begin(v), KE::end(v), pred); + return Impl::adjacent_find_exespace_impl( + "Kokkos::adjacent_find_view_api_default", ex, KE::begin(v), KE::end(v), + pred); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename BinaryPredicateType, + std::enable_if_t, int> = 0> auto adjacent_find(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, BinaryPredicateType pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::adjacent_find_impl(label, ex, KE::begin(v), KE::end(v), pred); + return Impl::adjacent_find_exespace_impl(label, ex, KE::begin(v), KE::end(v), + pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set1 +template , int> = 0> +KOKKOS_FUNCTION IteratorType adjacent_find(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last) { + return Impl::adjacent_find_team_impl(teamHandle, first, last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto adjacent_find( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + namespace KE = ::Kokkos::Experimental; + return Impl::adjacent_find_team_impl(teamHandle, KE::begin(v), KE::end(v)); +} + +// overload set2 +template , int> = 0> +KOKKOS_FUNCTION IteratorType adjacent_find(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last, + BinaryPredicateType pred) { + return Impl::adjacent_find_team_impl(teamHandle, first, last, pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto adjacent_find( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + BinaryPredicateType pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + namespace KE = ::Kokkos::Experimental; + return Impl::adjacent_find_team_impl(teamHandle, KE::begin(v), KE::end(v), + pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_AllOf.hpp b/algorithms/src/std_algorithms/Kokkos_AllOf.hpp index 2ffec7e144..d6ed4c4a7e 100644 --- a/algorithms/src/std_algorithms/Kokkos_AllOf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AllOf.hpp @@ -23,41 +23,79 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool all_of(const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate predicate) { - return Impl::all_of_impl("Kokkos::all_of_iterator_api_default", ex, first, - last, predicate); + return Impl::all_of_exespace_impl("Kokkos::all_of_iterator_api_default", ex, + first, last, predicate); } -template +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool all_of(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate predicate) { - return Impl::all_of_impl(label, ex, first, last, predicate); + return Impl::all_of_exespace_impl(label, ex, first, last, predicate); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool all_of(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::all_of_impl("Kokkos::all_of_view_api_default", ex, KE::cbegin(v), - KE::cend(v), std::move(predicate)); + return Impl::all_of_exespace_impl("Kokkos::all_of_view_api_default", ex, + KE::cbegin(v), KE::cend(v), + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool all_of(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::all_of_impl(label, ex, KE::cbegin(v), KE::cend(v), - std::move(predicate)); + return Impl::all_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v), + std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION bool all_of(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + Predicate predicate) { + return Impl::all_of_team_impl(teamHandle, first, last, predicate); +} + +template , int> = 0> +KOKKOS_FUNCTION bool all_of(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::all_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), + std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp b/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp index 019c466c6d..82356e6598 100644 --- a/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_AnyOf.hpp @@ -23,41 +23,79 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool any_of(const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate predicate) { - return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, first, last, - predicate); + return Impl::any_of_exespace_impl("Kokkos::any_of_view_api_default", ex, + first, last, predicate); } -template +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool any_of(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate predicate) { - return Impl::any_of_impl(label, ex, first, last, predicate); + return Impl::any_of_exespace_impl(label, ex, first, last, predicate); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool any_of(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::any_of_impl("Kokkos::any_of_view_api_default", ex, KE::cbegin(v), - KE::cend(v), std::move(predicate)); + return Impl::any_of_exespace_impl("Kokkos::any_of_view_api_default", ex, + KE::cbegin(v), KE::cend(v), + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool any_of(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::any_of_impl(label, ex, KE::cbegin(v), KE::cend(v), - std::move(predicate)); + return Impl::any_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v), + std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION bool any_of(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + Predicate predicate) { + return Impl::any_of_team_impl(teamHandle, first, last, predicate); +} + +template , int> = 0> +KOKKOS_FUNCTION bool any_of(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::any_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), + std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/algorithms/src/std_algorithms/Kokkos_Copy.hpp index 028f3b66b2..c5406c72b0 100644 --- a/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -23,44 +23,83 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy(const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::copy_impl("Kokkos::copy_iterator_api_default", ex, first, last, - d_first); + return Impl::copy_exespace_impl("Kokkos::copy_iterator_api_default", ex, + first, last, d_first); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::copy_impl(label, ex, first, last, d_first); + return Impl::copy_exespace_impl(label, ex, first, last, d_first); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); namespace KE = ::Kokkos::Experimental; - return Impl::copy_impl("Kokkos::copy_view_api_default", ex, - KE::cbegin(source), KE::cend(source), KE::begin(dest)); + return Impl::copy_exespace_impl("Kokkos::copy_view_api_default", ex, + KE::cbegin(source), KE::cend(source), + KE::begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + namespace KE = ::Kokkos::Experimental; + return Impl::copy_exespace_impl(label, ex, KE::cbegin(source), + KE::cend(source), KE::begin(dest)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator copy(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + OutputIterator d_first) { + return Impl::copy_team_impl(teamHandle, first, last, d_first); +} + +template , int> = 0> +KOKKOS_FUNCTION auto copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); namespace KE = ::Kokkos::Experimental; - return Impl::copy_impl(label, ex, KE::cbegin(source), KE::cend(source), - KE::begin(dest)); + return Impl::copy_team_impl(teamHandle, KE::cbegin(source), KE::cend(source), + KE::begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index deff6baf9a..82071a9362 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -23,42 +23,81 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType2 copy_backward(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 d_last) { - return Impl::copy_backward_impl("Kokkos::copy_backward_iterator_api_default", - ex, first, last, d_last); + return Impl::copy_backward_exespace_impl( + "Kokkos::copy_backward_iterator_api_default", ex, first, last, d_last); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType2 copy_backward(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 d_last) { - return Impl::copy_backward_impl(label, ex, first, last, d_last); + return Impl::copy_backward_exespace_impl(label, ex, first, last, d_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::copy_backward_impl("Kokkos::copy_backward_view_api_default", ex, - cbegin(source), cend(source), end(dest)); + return Impl::copy_backward_exespace_impl( + "Kokkos::copy_backward_view_api_default", ex, cbegin(source), + cend(source), end(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::copy_backward_exespace_impl(label, ex, cbegin(source), + cend(source), end(dest)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType2 copy_backward(const TeamHandleType& teamHandle, + IteratorType1 first, + IteratorType1 last, + IteratorType2 d_last) { + return Impl::copy_backward_team_impl(teamHandle, first, last, d_last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto copy_backward( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::copy_backward_impl(label, ex, cbegin(source), cend(source), - end(dest)); + return Impl::copy_backward_team_impl(teamHandle, cbegin(source), cend(source), + end(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index 3db2fc074f..599fde5737 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -23,46 +23,87 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy_if(const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first, Predicate pred) { - return Impl::copy_if_impl("Kokkos::copy_if_iterator_api_default", ex, first, - last, d_first, std::move(pred)); + return Impl::copy_if_exespace_impl("Kokkos::copy_if_iterator_api_default", ex, + first, last, d_first, std::move(pred)); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy_if(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first, Predicate pred) { - return Impl::copy_if_impl(label, ex, first, last, d_first, std::move(pred)); + return Impl::copy_if_exespace_impl(label, ex, first, last, d_first, + std::move(pred)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::copy_if_impl("Kokkos::copy_if_view_api_default", ex, - cbegin(source), cend(source), begin(dest), - std::move(pred)); + return Impl::copy_if_exespace_impl("Kokkos::copy_if_view_api_default", ex, + cbegin(source), cend(source), begin(dest), + std::move(pred)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::copy_if_exespace_impl(label, ex, cbegin(source), cend(source), + begin(dest), std::move(pred)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator copy_if(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + OutputIterator d_first, Predicate pred) { + return Impl::copy_if_team_impl(teamHandle, first, last, d_first, + std::move(pred)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto copy_if( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::copy_if_impl(label, ex, cbegin(source), cend(source), - begin(dest), std::move(pred)); + return Impl::copy_if_team_impl(teamHandle, cbegin(source), cend(source), + begin(dest), std::move(pred)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index a64f99b5c0..637d8d4cbc 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -23,45 +23,83 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename Size, + typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy_n(const ExecutionSpace& ex, InputIterator first, Size count, OutputIterator result) { - return Impl::copy_n_impl("Kokkos::copy_n_iterator_api_default", ex, first, - count, result); + return Impl::copy_n_exespace_impl("Kokkos::copy_n_iterator_api_default", ex, + first, count, result); } -template +template < + typename ExecutionSpace, typename InputIterator, typename Size, + typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator copy_n(const std::string& label, const ExecutionSpace& ex, InputIterator first, Size count, OutputIterator result) { - return Impl::copy_n_impl(label, ex, first, count, result); + return Impl::copy_n_exespace_impl(label, ex, first, count, result); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename Size, typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); namespace KE = ::Kokkos::Experimental; - return Impl::copy_n_impl("Kokkos::copy_n_view_api_default", ex, - KE::cbegin(source), count, KE::begin(dest)); + return Impl::copy_n_exespace_impl("Kokkos::copy_n_view_api_default", ex, + KE::cbegin(source), count, KE::begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename Size, typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + namespace KE = ::Kokkos::Experimental; + return Impl::copy_n_exespace_impl(label, ex, KE::cbegin(source), count, + KE::begin(dest)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator copy_n(const TeamHandleType& teamHandle, + InputIterator first, Size count, + OutputIterator result) { + return Impl::copy_n_team_impl(teamHandle, first, count, result); +} + +template , int> = 0> +KOKKOS_FUNCTION auto copy_n( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, Size count, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); namespace KE = ::Kokkos::Experimental; - return Impl::copy_n_impl(label, ex, KE::cbegin(source), count, - KE::begin(dest)); + return Impl::copy_n_team_impl(teamHandle, KE::cbegin(source), count, + KE::begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Count.hpp b/algorithms/src/std_algorithms/Kokkos_Count.hpp index 3ac63467ec..f179e88bab 100644 --- a/algorithms/src/std_algorithms/Kokkos_Count.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Count.hpp @@ -23,41 +23,81 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> typename IteratorType::difference_type count(const ExecutionSpace& ex, IteratorType first, IteratorType last, const T& value) { - return Impl::count_impl("Kokkos::count_iterator_api_default", ex, first, last, - value); + return Impl::count_exespace_impl("Kokkos::count_iterator_api_default", ex, + first, last, value); } -template +template < + typename ExecutionSpace, typename IteratorType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> typename IteratorType::difference_type count(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, const T& value) { - return Impl::count_impl(label, ex, first, last, value); + return Impl::count_exespace_impl(label, ex, first, last, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto count(const ExecutionSpace& ex, const ::Kokkos::View& v, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::count_impl("Kokkos::count_view_api_default", ex, KE::cbegin(v), - KE::cend(v), value); + return Impl::count_exespace_impl("Kokkos::count_view_api_default", ex, + KE::cbegin(v), KE::cend(v), value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto count(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::count_impl(label, ex, KE::cbegin(v), KE::cend(v), value); + return Impl::count_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v), + value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +template , int> = 0> +KOKKOS_FUNCTION typename IteratorType::difference_type count( + const TeamHandleType& teamHandle, IteratorType first, IteratorType last, + const T& value) { + return Impl::count_team_impl(teamHandle, first, last, value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto count(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + const T& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::count_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_CountIf.hpp b/algorithms/src/std_algorithms/Kokkos_CountIf.hpp index b9731d378a..967cf75e7a 100644 --- a/algorithms/src/std_algorithms/Kokkos_CountIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CountIf.hpp @@ -23,46 +23,84 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> typename IteratorType::difference_type count_if(const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::count_if_impl("Kokkos::count_if_iterator_api_default", ex, first, - last, std::move(predicate)); + return Impl::count_if_exespace_impl("Kokkos::count_if_iterator_api_default", + ex, first, last, std::move(predicate)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> typename IteratorType::difference_type count_if(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::count_if_impl(label, ex, first, last, std::move(predicate)); + return Impl::count_if_exespace_impl(label, ex, first, last, + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto count_if(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::count_if_impl("Kokkos::count_if_view_api_default", ex, - KE::cbegin(v), KE::cend(v), std::move(predicate)); + return Impl::count_if_exespace_impl("Kokkos::count_if_view_api_default", ex, + KE::cbegin(v), KE::cend(v), + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto count_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::count_if_impl(label, ex, KE::cbegin(v), KE::cend(v), - std::move(predicate)); + return Impl::count_if_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v), + std::move(predicate)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION typename IteratorType::difference_type count_if( + const TeamHandleType& teamHandle, IteratorType first, IteratorType last, + Predicate predicate) { + return Impl::count_if_team_impl(teamHandle, first, last, + std::move(predicate)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto count_if(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::count_if_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), + std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/algorithms/src/std_algorithms/Kokkos_Equal.hpp index 37c0d75ef5..593c42f87e 100644 --- a/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -23,145 +23,260 @@ namespace Kokkos { namespace Experimental { -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2) { - return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1, - last1, first2); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, - IteratorType1 last1, IteratorType2 first2) { - return Impl::equal_impl(label, ex, first1, last1, first2); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2, BinaryPredicateType predicate) { - return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1, - last1, first2, std::move(predicate)); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, - IteratorType1 last1, IteratorType2 first2, - BinaryPredicateType predicate) { - return Impl::equal_impl(label, ex, first1, last1, first2, - std::move(predicate)); -} - -template +// +// overload set accepting execution space +// +template && + Kokkos::is_execution_space_v, + int> = 0> +bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2) { + return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex, + first1, last1, first2); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, IteratorType2 first2) { + return Impl::equal_exespace_impl(label, ex, first1, last1, first2); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, BinaryPredicateType predicate) { + return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex, + first1, last1, first2, std::move(predicate)); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, + BinaryPredicateType predicate) { + return Impl::equal_exespace_impl(label, ex, first1, last1, first2, + std::move(predicate)); +} + +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::equal_impl("Kokkos::equal_view_api_default", ex, - KE::cbegin(view1), KE::cend(view1), - KE::cbegin(view2)); + return Impl::equal_exespace_impl("Kokkos::equal_view_api_default", ex, + KE::cbegin(view1), KE::cend(view1), + KE::cbegin(view2)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1), - KE::cbegin(view2)); + return Impl::equal_exespace_impl(label, ex, KE::cbegin(view1), + KE::cend(view1), KE::cbegin(view2)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::equal_impl("Kokkos::equal_view_api_default", ex, - KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), - std::move(predicate)); + return Impl::equal_exespace_impl("Kokkos::equal_view_api_default", ex, + KE::cbegin(view1), KE::cend(view1), + KE::cbegin(view2), std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, + BinaryPredicateType predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::equal_exespace_impl(label, ex, KE::cbegin(view1), + KE::cend(view1), KE::cbegin(view2), + std::move(predicate)); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2) { + return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex, + first1, last1, first2, last2); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, + IteratorType2 last2) { + return Impl::equal_exespace_impl(label, ex, first1, last1, first2, last2); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, BinaryPredicateType predicate) { + return Impl::equal_exespace_impl("Kokkos::equal_iterator_api_default", ex, + first1, last1, first2, last2, + std::move(predicate)); +} + +template && ::Kokkos:: + is_execution_space_v, + int> = 0> +bool equal(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, + IteratorType2 last2, BinaryPredicateType predicate) { + return Impl::equal_exespace_impl(label, ex, first1, last1, first2, last2, + std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template && ::Kokkos:: + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2) { + return Impl::equal_team_impl(teamHandle, first1, last1, first2); +} + +template && ::Kokkos:: + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, + BinaryPredicateType predicate) { + return Impl::equal_team_impl(teamHandle, first1, last1, first2, + std::move(predicate)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool equal( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::equal_impl(label, ex, KE::cbegin(view1), KE::cend(view1), - KE::cbegin(view2), std::move(predicate)); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2, IteratorType2 last2) { - return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1, - last1, first2, last2); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, - IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { - return Impl::equal_impl(label, ex, first1, last1, first2, last2); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2, IteratorType2 last2, - BinaryPredicateType predicate) { - return Impl::equal_impl("Kokkos::equal_iterator_api_default", ex, first1, - last1, first2, last2, std::move(predicate)); -} - -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - IteratorType1, IteratorType2>::value, - bool> -equal(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, - IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, - BinaryPredicateType predicate) { - return Impl::equal_impl(label, ex, first1, last1, first2, last2, - std::move(predicate)); + return Impl::equal_team_impl(teamHandle, KE::cbegin(view1), KE::cend(view1), + KE::cbegin(view2)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool equal( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2, + BinaryPredicateType predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::equal_team_impl(teamHandle, KE::cbegin(view1), KE::cend(view1), + KE::cbegin(view2), std::move(predicate)); +} + +template && ::Kokkos:: + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2) { + return Impl::equal_team_impl(teamHandle, first1, last1, first2, last2); +} + +template && ::Kokkos:: + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION bool equal(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, + BinaryPredicateType predicate) { + return Impl::equal_team_impl(teamHandle, first1, last1, first2, last2, + std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp index 4e05676c2c..ee3a105126 100644 --- a/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp @@ -23,105 +23,130 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value) { - static_assert(std::is_move_constructible::value, +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_default_op_impl( + return Impl::exclusive_scan_default_op_exespace_impl( "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last, - first_dest, init_value); + first_dest, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value) { - static_assert(std::is_move_constructible::value, +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_default_op_impl(label, ex, first, last, - first_dest, init_value); + return Impl::exclusive_scan_default_op_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_impl( + return Impl::exclusive_scan_default_op_exespace_impl( "Kokkos::exclusive_scan_default_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest), init_value); + return Impl::exclusive_scan_default_op_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value)); } // overload set 2 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value, BinaryOpType bop) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last, - first_dest, init_value, bop); + first_dest, std::move(init_value), bop); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value, - BinaryOpType bop) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType exclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_impl(label, ex, first, last, first_dest, - init_value, bop); + return Impl::exclusive_scan_custom_op_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value), bop); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -129,18 +154,20 @@ auto exclusive_scan(const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( "Kokkos::exclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value, bop); + std::move(init_value), bop); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -148,12 +175,92 @@ auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_impl( + return Impl::exclusive_scan_custom_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), init_value, bop); + KE::begin(view_dest), std::move(init_value), bop); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1 +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::exclusive_scan_default_op_team_impl( + teamHandle, first, last, first_dest, std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::exclusive_scan_default_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value)); +} + +// overload set 2 +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::exclusive_scan_custom_op_team_impl( + teamHandle, first, last, first_dest, std::move(init_value), bop); +} + +template , int> = 0> +KOKKOS_FUNCTION auto exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value, BinaryOpType bop) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::exclusive_scan_custom_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value), bop); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Fill.hpp b/algorithms/src/std_algorithms/Kokkos_Fill.hpp index 1e300a4c20..6d805ba1be 100644 --- a/algorithms/src/std_algorithms/Kokkos_Fill.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Fill.hpp @@ -23,33 +23,67 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void fill(const ExecutionSpace& ex, IteratorType first, IteratorType last, const T& value) { - Impl::fill_impl("Kokkos::fill_iterator_api_default", ex, first, last, value); + Impl::fill_exespace_impl("Kokkos::fill_iterator_api_default", ex, first, last, + value); } -template +template < + typename ExecutionSpace, typename IteratorType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void fill(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, const T& value) { - Impl::fill_impl(label, ex, first, last, value); + Impl::fill_exespace_impl(label, ex, first, last, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void fill(const ExecutionSpace& ex, const ::Kokkos::View& view, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - - Impl::fill_impl("Kokkos::fill_view_api_default", ex, begin(view), end(view), - value); + Impl::fill_exespace_impl("Kokkos::fill_view_api_default", ex, begin(view), + end(view), value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void fill(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::fill_exespace_impl(label, ex, begin(view), end(view), value); +} - Impl::fill_impl(label, ex, begin(view), end(view), value); +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION void fill(const TeamHandleType& th, IteratorType first, + IteratorType last, const T& value) { + Impl::fill_team_impl(th, first, last, value); +} + +template , int> = 0> +KOKKOS_FUNCTION void fill(const TeamHandleType& th, + const ::Kokkos::View& view, + const T& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::fill_team_impl(th, begin(view), end(view), value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_FillN.hpp b/algorithms/src/std_algorithms/Kokkos_FillN.hpp index 02503dfd14..66b8cd66cc 100644 --- a/algorithms/src/std_algorithms/Kokkos_FillN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_FillN.hpp @@ -23,38 +23,72 @@ namespace Kokkos { namespace Experimental { -template +template < + typename ExecutionSpace, typename IteratorType, typename SizeType, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType fill_n(const ExecutionSpace& ex, IteratorType first, SizeType n, const T& value) { - return Impl::fill_n_impl("Kokkos::fill_n_iterator_api_default", ex, first, n, - value); + return Impl::fill_n_exespace_impl("Kokkos::fill_n_iterator_api_default", ex, + first, n, value); } -template +template < + typename ExecutionSpace, typename IteratorType, typename SizeType, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType fill_n(const std::string& label, const ExecutionSpace& ex, IteratorType first, SizeType n, const T& value) { - return Impl::fill_n_impl(label, ex, first, n, value); + return Impl::fill_n_exespace_impl(label, ex, first, n, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename SizeType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto fill_n(const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType n, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::fill_n_impl("Kokkos::fill_n_view_api_default", ex, begin(view), - n, value); + return Impl::fill_n_exespace_impl("Kokkos::fill_n_view_api_default", ex, + begin(view), n, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename SizeType, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto fill_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType n, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::fill_n_impl(label, ex, begin(view), n, value); + return Impl::fill_n_exespace_impl(label, ex, begin(view), n, value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType fill_n(const TeamHandleType& th, + IteratorType first, SizeType n, + const T& value) { + return Impl::fill_n_team_impl(th, first, n, value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto fill_n(const TeamHandleType& th, + const ::Kokkos::View& view, + SizeType n, const T& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::fill_n_team_impl(th, begin(view), n, value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Find.hpp b/algorithms/src/std_algorithms/Kokkos_Find.hpp index 65b68cf931..e5e2b0e2b0 100644 --- a/algorithms/src/std_algorithms/Kokkos_Find.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Find.hpp @@ -23,36 +23,76 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> InputIterator find(const ExecutionSpace& ex, InputIterator first, InputIterator last, const T& value) { - return Impl::find_impl("Kokkos::find_iterator_api_default", ex, first, last, - value); + return Impl::find_exespace_impl("Kokkos::find_iterator_api_default", ex, + first, last, value); } -template +template < + typename ExecutionSpace, typename InputIterator, typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> InputIterator find(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, const T& value) { - return Impl::find_impl(label, ex, first, last, value); + return Impl::find_exespace_impl(label, ex, first, last, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find(const ExecutionSpace& ex, const ::Kokkos::View& view, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::find_impl("Kokkos::find_view_api_default", ex, KE::begin(view), - KE::end(view), value); + return Impl::find_exespace_impl("Kokkos::find_view_api_default", ex, + KE::begin(view), KE::end(view), value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename T, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const T& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::find_impl(label, ex, KE::begin(view), KE::end(view), value); + return Impl::find_exespace_impl(label, ex, KE::begin(view), KE::end(view), + value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION InputIterator find(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + const T& value) { + return Impl::find_team_impl(teamHandle, first, last, value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto find(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const T& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_team_impl(teamHandle, KE::begin(view), KE::end(view), + value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp b/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp index f6a38855eb..a4ec735fd5 100644 --- a/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp +++ b/algorithms/src/std_algorithms/Kokkos_FindEnd.hpp @@ -24,24 +24,34 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1: no binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first, - last, s_first, s_last); + return Impl::find_end_exespace_impl("Kokkos::find_end_iterator_api_default", + ex, first, last, s_first, s_last); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::find_end_impl(label, ex, first, last, s_first, s_last); + return Impl::find_end_exespace_impl(label, ex, first, last, s_first, s_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_end(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -49,13 +59,15 @@ auto find_end(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex, - KE::begin(view), KE::end(view), KE::begin(s_view), - KE::end(s_view)); + return Impl::find_end_exespace_impl("Kokkos::find_end_view_api_default", ex, + KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_end(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -63,31 +75,38 @@ auto find_end(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view)); + return Impl::find_end_exespace_impl(label, ex, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); } // overload set 2: binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_end(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::find_end_impl("Kokkos::find_end_iterator_api_default", ex, first, - last, s_first, s_last, pred); + return Impl::find_end_exespace_impl("Kokkos::find_end_iterator_api_default", + ex, first, last, s_first, s_last, pred); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_end(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::find_end_impl(label, ex, first, last, s_first, s_last, pred); + return Impl::find_end_exespace_impl(label, ex, first, last, s_first, s_last, + pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_end(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -96,13 +115,15 @@ auto find_end(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_end_impl("Kokkos::find_end_view_api_default", ex, - KE::begin(view), KE::end(view), KE::begin(s_view), - KE::end(s_view), pred); + return Impl::find_end_exespace_impl("Kokkos::find_end_view_api_default", ex, + KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_end(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -111,8 +132,71 @@ auto find_end(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_end_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view), pred); + return Impl::find_end_exespace_impl(label, ex, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1: no binary predicate passed +template , int> = 0> +KOKKOS_FUNCTION IteratorType1 find_end(const TeamHandleType& teamHandle, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last) { + return Impl::find_end_team_impl(teamHandle, first, last, s_first, s_last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto find_end( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_end_team_impl(teamHandle, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); +} + +// overload set 2: binary predicate passed +template , int> = 0> + +KOKKOS_FUNCTION IteratorType1 find_end(const TeamHandleType& teamHandle, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last, + const BinaryPredicateType& pred) { + return Impl::find_end_team_impl(teamHandle, first, last, s_first, s_last, + pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto find_end( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view, + const BinaryPredicateType& pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_end_team_impl(teamHandle, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp b/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp index 6b0e4993ee..341a70e2f2 100644 --- a/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_FindFirstOf.hpp @@ -23,24 +23,36 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1: no binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default", - ex, first, last, s_first, s_last); + return Impl::find_first_of_exespace_impl( + "Kokkos::find_first_of_iterator_api_default", ex, first, last, s_first, + s_last); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last); + return Impl::find_first_of_exespace_impl(label, ex, first, last, s_first, + s_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_first_of(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -48,13 +60,15 @@ auto find_first_of(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex, - KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view)); + return Impl::find_first_of_exespace_impl( + "Kokkos::find_first_of_view_api_default", ex, KE::begin(view), + KE::end(view), KE::begin(s_view), KE::end(s_view)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_first_of(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -62,33 +76,41 @@ auto find_first_of(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view)); + return Impl::find_first_of_exespace_impl(label, ex, KE::begin(view), + KE::end(view), KE::begin(s_view), + KE::end(s_view)); } // overload set 2: binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_first_of(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::find_first_of_impl("Kokkos::find_first_of_iterator_api_default", - ex, first, last, s_first, s_last, pred); + return Impl::find_first_of_exespace_impl( + "Kokkos::find_first_of_iterator_api_default", ex, first, last, s_first, + s_last, pred); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 find_first_of(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::find_first_of_impl(label, ex, first, last, s_first, s_last, - pred); + return Impl::find_first_of_exespace_impl(label, ex, first, last, s_first, + s_last, pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_first_of(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -97,13 +119,15 @@ auto find_first_of(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_first_of_impl("Kokkos::find_first_of_view_api_default", ex, - KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view), pred); + return Impl::find_first_of_exespace_impl( + "Kokkos::find_first_of_view_api_default", ex, KE::begin(view), + KE::end(view), KE::begin(s_view), KE::end(s_view), pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto find_first_of(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -112,8 +136,77 @@ auto find_first_of(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::find_first_of_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view), pred); + return Impl::find_first_of_exespace_impl(label, ex, KE::begin(view), + KE::end(view), KE::begin(s_view), + KE::end(s_view), pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1: no binary predicate passed +template , int> = 0> +KOKKOS_FUNCTION IteratorType1 find_first_of(const TeamHandleType& teamHandle, + IteratorType1 first, + IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last) { + return Impl::find_first_of_team_impl(teamHandle, first, last, s_first, + s_last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto find_first_of( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_first_of_team_impl(teamHandle, KE::begin(view), + KE::end(view), KE::begin(s_view), + KE::end(s_view)); +} + +// overload set 2: binary predicate passed +template , int> = 0> + +KOKKOS_FUNCTION IteratorType1 find_first_of(const TeamHandleType& teamHandle, + IteratorType1 first, + IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last, + const BinaryPredicateType& pred) { + return Impl::find_first_of_team_impl(teamHandle, first, last, s_first, s_last, + pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto find_first_of( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view, + const BinaryPredicateType& pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_first_of_team_impl(teamHandle, KE::begin(view), + KE::end(view), KE::begin(s_view), + KE::end(s_view), pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_FindIf.hpp b/algorithms/src/std_algorithms/Kokkos_FindIf.hpp index 911316a668..283fab7617 100644 --- a/algorithms/src/std_algorithms/Kokkos_FindIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_FindIf.hpp @@ -23,42 +23,82 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType find_if(const ExecutionSpace& ex, IteratorType first, IteratorType last, PredicateType predicate) { - return Impl::find_if_or_not_impl("Kokkos::find_if_iterator_api_default", - ex, first, last, std::move(predicate)); + return Impl::find_if_or_not_exespace_impl( + "Kokkos::find_if_iterator_api_default", ex, first, last, + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType find_if(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, PredicateType predicate) { - return Impl::find_if_or_not_impl(label, ex, first, last, - std::move(predicate)); + return Impl::find_if_or_not_exespace_impl(label, ex, first, last, + std::move(predicate)); } -template +template ::value, + int> = 0> auto find_if(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::find_if_or_not_impl("Kokkos::find_if_view_api_default", ex, - KE::begin(v), KE::end(v), - std::move(predicate)); + return Impl::find_if_or_not_exespace_impl( + "Kokkos::find_if_view_api_default", ex, KE::begin(v), KE::end(v), + std::move(predicate)); } -template +template ::value, + int> = 0> auto find_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::find_if_or_not_impl(label, ex, KE::begin(v), KE::end(v), - std::move(predicate)); + return Impl::find_if_or_not_exespace_impl( + label, ex, KE::begin(v), KE::end(v), std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType find_if(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + PredicateType predicate) { + return Impl::find_if_or_not_team_impl(teamHandle, first, last, + std::move(predicate)); +} + +template < + typename TeamHandleType, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto find_if(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + namespace KE = ::Kokkos::Experimental; + return Impl::find_if_or_not_team_impl(teamHandle, KE::begin(v), + KE::end(v), std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp b/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp index 18294d7b7d..5e17a6f539 100644 --- a/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp +++ b/algorithms/src/std_algorithms/Kokkos_FindIfNot.hpp @@ -23,45 +23,84 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType find_if_not(const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::find_if_or_not_impl( + return Impl::find_if_or_not_exespace_impl( "Kokkos::find_if_not_iterator_api_default", ex, first, last, std::move(predicate)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType find_if_not(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::find_if_or_not_impl(label, ex, first, last, - std::move(predicate)); + return Impl::find_if_or_not_exespace_impl(label, ex, first, last, + std::move(predicate)); } -template +template ::value, + int> = 0> auto find_if_not(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::find_if_or_not_impl( + return Impl::find_if_or_not_exespace_impl( "Kokkos::find_if_not_view_api_default", ex, KE::begin(v), KE::end(v), std::move(predicate)); } -template +template ::value, + int> = 0> auto find_if_not(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::find_if_or_not_impl(label, ex, KE::begin(v), KE::end(v), - std::move(predicate)); + return Impl::find_if_or_not_exespace_impl( + label, ex, KE::begin(v), KE::end(v), std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType find_if_not(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + Predicate predicate) { + return Impl::find_if_or_not_team_impl(teamHandle, first, last, + std::move(predicate)); +} + +template < + typename TeamHandleType, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto find_if_not( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::find_if_or_not_team_impl( + teamHandle, KE::begin(v), KE::end(v), std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp index d7b08e4842..05969be463 100644 --- a/algorithms/src/std_algorithms/Kokkos_ForEach.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ForEach.hpp @@ -23,42 +23,79 @@ namespace Kokkos { namespace Experimental { -template -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_impl(label, ex, first, last, std::move(functor)); +// +// overload set accepting execution space +// +template < + class ExecutionSpace, class IteratorType, class UnaryFunctorType, + std::enable_if_t, int> = 0> +void for_each(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, UnaryFunctorType functor) { + Impl::for_each_exespace_impl(label, ex, first, last, std::move(functor)); +} + +template < + class ExecutionSpace, class IteratorType, class UnaryFunctorType, + std::enable_if_t, int> = 0> +void for_each(const ExecutionSpace& ex, IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", ex, + first, last, std::move(functor)); } -template -UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { - return Impl::for_each_impl("Kokkos::for_each_iterator_api_default", ex, first, - last, std::move(functor)); +template < + class ExecutionSpace, class DataType, class... Properties, + class UnaryFunctorType, + std::enable_if_t, int> = 0> +void for_each(const std::string& label, const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), + std::move(functor)); } -template -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +template < + class ExecutionSpace, class DataType, class... Properties, + class UnaryFunctorType, + std::enable_if_t, int> = 0> +void for_each(const ExecutionSpace& ex, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_impl(label, ex, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, + KE::begin(v), KE::end(v), std::move(functor)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +template , int> = 0> +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); } -template -UnaryFunctorType for_each(const ExecutionSpace& ex, - const ::Kokkos::View& v, - UnaryFunctorType functor) { +template , int> = 0> +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_impl("Kokkos::for_each_view_api_default", ex, - KE::begin(v), KE::end(v), std::move(functor)); + Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), + std::move(functor)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp b/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp index f1769da05b..e6fbcad891 100644 --- a/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ForEachN.hpp @@ -23,43 +23,87 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + class ExecutionSpace, class IteratorType, class SizeType, + class UnaryFunctorType, + std::enable_if_t, int> = 0> IteratorType for_each_n(const std::string& label, const ExecutionSpace& ex, IteratorType first, SizeType n, UnaryFunctorType functor) { - return Impl::for_each_n_impl(label, ex, first, n, std::move(functor)); + return Impl::for_each_n_exespace_impl(label, ex, first, n, + std::move(functor)); } -template +template < + class ExecutionSpace, class IteratorType, class SizeType, + class UnaryFunctorType, + std::enable_if_t, int> = 0> IteratorType for_each_n(const ExecutionSpace& ex, IteratorType first, SizeType n, UnaryFunctorType functor) { - return Impl::for_each_n_impl("Kokkos::for_each_n_iterator_api_default", ex, - first, n, std::move(functor)); + return Impl::for_each_n_exespace_impl( + "Kokkos::for_each_n_iterator_api_default", ex, first, n, + std::move(functor)); } -template +template < + class ExecutionSpace, class DataType, class... Properties, class SizeType, + class UnaryFunctorType, + std::enable_if_t, int> = 0> auto for_each_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, SizeType n, UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_n_impl(label, ex, KE::begin(v), n, std::move(functor)); + return Impl::for_each_n_exespace_impl(label, ex, KE::begin(v), n, + std::move(functor)); } -template +template < + class ExecutionSpace, class DataType, class... Properties, class SizeType, + class UnaryFunctorType, + std::enable_if_t, int> = 0> auto for_each_n(const ExecutionSpace& ex, const ::Kokkos::View& v, SizeType n, UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_n_impl("Kokkos::for_each_n_view_api_default", ex, - KE::begin(v), n, std::move(functor)); + return Impl::for_each_n_exespace_impl("Kokkos::for_each_n_view_api_default", + ex, KE::begin(v), n, + std::move(functor)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +template , int> = 0> +KOKKOS_FUNCTION IteratorType for_each_n(const TeamHandleType& teamHandle, + IteratorType first, SizeType n, + UnaryFunctorType functor) { + return Impl::for_each_n_team_impl(teamHandle, first, n, std::move(functor)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto for_each_n( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, SizeType n, + UnaryFunctorType functor) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::for_each_n_team_impl(teamHandle, KE::begin(v), n, + std::move(functor)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Generate.hpp b/algorithms/src/std_algorithms/Kokkos_Generate.hpp index 13e12783e0..a3295084ee 100644 --- a/algorithms/src/std_algorithms/Kokkos_Generate.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Generate.hpp @@ -23,38 +23,68 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template , int> = 0> void generate(const ExecutionSpace& ex, IteratorType first, IteratorType last, Generator g) { - Impl::generate_impl("Kokkos::generate_iterator_api_default", ex, first, last, - std::move(g)); + Impl::generate_exespace_impl("Kokkos::generate_iterator_api_default", ex, + first, last, std::move(g)); } -template +template , int> = 0> void generate(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, Generator g) { - Impl::generate_impl(label, ex, first, last, std::move(g)); + Impl::generate_exespace_impl(label, ex, first, last, std::move(g)); } -template +template , int> = 0> void generate(const ExecutionSpace& ex, const ::Kokkos::View& view, Generator g) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::generate_impl("Kokkos::generate_view_api_default", ex, begin(view), - end(view), std::move(g)); + Impl::generate_exespace_impl("Kokkos::generate_view_api_default", ex, + begin(view), end(view), std::move(g)); } -template +template , int> = 0> void generate(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, Generator g) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::generate_impl(label, ex, begin(view), end(view), std::move(g)); + Impl::generate_exespace_impl(label, ex, begin(view), end(view), std::move(g)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION void generate(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + Generator g) { + Impl::generate_team_impl(teamHandle, first, last, std::move(g)); +} + +template , int> = 0> +KOKKOS_FUNCTION void generate( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, Generator g) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::generate_team_impl(teamHandle, begin(view), end(view), std::move(g)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp b/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp index 4d17512228..e480062c23 100644 --- a/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_GenerateN.hpp @@ -23,40 +23,75 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template , int> = 0> IteratorType generate_n(const ExecutionSpace& ex, IteratorType first, Size count, Generator g) { - Impl::generate_n_impl("Kokkos::generate_n_iterator_api_default", ex, first, - count, std::move(g)); - return first + count; + return Impl::generate_n_exespace_impl( + "Kokkos::generate_n_iterator_api_default", ex, first, count, + std::move(g)); } -template +template , int> = 0> IteratorType generate_n(const std::string& label, const ExecutionSpace& ex, IteratorType first, Size count, Generator g) { - Impl::generate_n_impl(label, ex, first, count, std::move(g)); - return first + count; + return Impl::generate_n_exespace_impl(label, ex, first, count, std::move(g)); } -template +template , int> = 0> auto generate_n(const ExecutionSpace& ex, const ::Kokkos::View& view, Size count, Generator g) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::generate_n_impl("Kokkos::generate_n_view_api_default", ex, - begin(view), count, std::move(g)); + return Impl::generate_n_exespace_impl("Kokkos::generate_n_view_api_default", + ex, begin(view), count, std::move(g)); } -template +template , int> = 0> auto generate_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, Size count, Generator g) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::generate_n_impl(label, ex, begin(view), count, std::move(g)); + return Impl::generate_n_exespace_impl(label, ex, begin(view), count, + std::move(g)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType generate_n(const TeamHandleType& teamHandle, + IteratorType first, Size count, + Generator g) { + return Impl::generate_n_team_impl(teamHandle, first, count, std::move(g)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto generate_n( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, Size count, + Generator g) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::generate_n_team_impl(teamHandle, begin(view), count, + std::move(g)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp index bcd731b850..a0e540b5e7 100644 --- a/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_InclusiveScan.hpp @@ -23,33 +23,45 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest) { - return Impl::inclusive_scan_default_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_exespace_impl( "Kokkos::inclusive_scan_default_functors_iterator_api", ex, first, last, first_dest); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest) { - return Impl::inclusive_scan_default_op_impl(label, ex, first, last, - first_dest); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_exespace_impl(label, ex, first, last, + first_dest); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -57,13 +69,15 @@ auto inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_default_op_impl( + return Impl::inclusive_scan_default_op_exespace_impl( "Kokkos::inclusive_scan_default_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -71,39 +85,45 @@ auto inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_default_op_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), - KE::begin(view_dest)); + return Impl::inclusive_scan_default_op_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest)); } // overload set 2 (accepting custom binary op) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOp binary_op) { - return Impl::inclusive_scan_custom_binary_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, binary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOp binary_op) { - return Impl::inclusive_scan_custom_binary_op_impl(label, ex, first, last, - first_dest, binary_op); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_exespace_impl( + label, ex, first, last, first_dest, binary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -111,14 +131,16 @@ auto inclusive_scan(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -126,67 +148,192 @@ auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op); } // overload set 3 -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOp binary_op, ValueType init_value) { - return Impl::inclusive_scan_custom_binary_op_impl( +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_iterator_api", ex, first, last, - first_dest, binary_op, init_value); + first_dest, binary_op, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOp binary_op, - ValueType init_value) { - return Impl::inclusive_scan_custom_binary_op_impl( - label, ex, first, last, first_dest, binary_op, init_value); +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType inclusive_scan(const std::string& label, + const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::inclusive_scan_custom_binary_op_exespace_impl( + label, ex, first, last, first_dest, binary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOp binary_op, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( "Kokkos::inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - binary_op, init_value); + binary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOp, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto inclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOp binary_op, ValueType init_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::inclusive_scan_custom_binary_op_impl( + return Impl::inclusive_scan_custom_binary_op_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), binary_op, init_value); + KE::begin(view_dest), binary_op, std::move(init_value)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1 +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType +inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest) { + return Impl::inclusive_scan_default_op_team_impl(teamHandle, first, last, + first_dest); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_default_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest)); +} + +// overload set 2 (accepting custom binary op) +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, BinaryOp binary_op) { + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, first, last, first_dest, binary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOp binary_op) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op); +} + +// overload set 3 +template && :: + Kokkos::is_team_handle_v, + int> = 0> + +KOKKOS_FUNCTION OutputIteratorType +inclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOp binary_op, ValueType init_value) { + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, first, last, first_dest, binary_op, std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOp binary_op, ValueType init_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + return Impl::inclusive_scan_custom_binary_op_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, std::move(init_value)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp b/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp index 29d6be9e8b..42f20bc4ec 100644 --- a/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IsPartitioned.hpp @@ -23,39 +23,78 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_partitioned(const ExecutionSpace& ex, IteratorType first, IteratorType last, PredicateType p) { - return Impl::is_partitioned_impl( + return Impl::is_partitioned_exespace_impl( "Kokkos::is_partitioned_iterator_api_default", ex, first, last, std::move(p)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_partitioned(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, PredicateType p) { - return Impl::is_partitioned_impl(label, ex, first, last, std::move(p)); + return Impl::is_partitioned_exespace_impl(label, ex, first, last, + std::move(p)); } -template +template < + typename ExecutionSpace, typename PredicateType, typename DataType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_partitioned(const ExecutionSpace& ex, const ::Kokkos::View& v, PredicateType p) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::is_partitioned_impl("Kokkos::is_partitioned_view_api_default", - ex, cbegin(v), cend(v), std::move(p)); + return Impl::is_partitioned_exespace_impl( + "Kokkos::is_partitioned_view_api_default", ex, cbegin(v), cend(v), + std::move(p)); } -template +template < + typename ExecutionSpace, typename PredicateType, typename DataType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_partitioned(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, PredicateType p) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::is_partitioned_impl(label, ex, cbegin(v), cend(v), std::move(p)); + return Impl::is_partitioned_exespace_impl(label, ex, cbegin(v), cend(v), + std::move(p)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION bool is_partitioned(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + PredicateType p) { + return Impl::is_partitioned_team_impl(teamHandle, first, last, std::move(p)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool is_partitioned( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, PredicateType p) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + return Impl::is_partitioned_team_impl(teamHandle, cbegin(v), cend(v), + std::move(p)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp b/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp index f036254a02..2c676c3ff3 100644 --- a/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp @@ -23,55 +23,73 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex, - first, last); + return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_iterator_api_default", + ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::is_sorted_impl(label, ex, first, last); + return Impl::is_sorted_exespace_impl(label, ex, first, last); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex, - KE::cbegin(view), KE::cend(view)); + return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_view_api_default", ex, + KE::cbegin(view), KE::cend(view)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view)); + return Impl::is_sorted_exespace_impl(label, ex, KE::cbegin(view), + KE::cend(view)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::is_sorted_impl("Kokkos::is_sorted_iterator_api_default", ex, - first, last, std::move(comp)); + return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_iterator_api_default", + ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::is_sorted_impl(label, ex, first, last, std::move(comp)); + return Impl::is_sorted_exespace_impl(label, ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { @@ -79,13 +97,15 @@ bool is_sorted(const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_impl("Kokkos::is_sorted_view_api_default", ex, - KE::cbegin(view), KE::cend(view), - std::move(comp)); + return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_view_api_default", ex, + KE::cbegin(view), KE::cend(view), + std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { @@ -93,8 +113,56 @@ bool is_sorted(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_impl(label, ex, KE::cbegin(view), KE::cend(view), - std::move(comp)); + return Impl::is_sorted_exespace_impl(label, ex, KE::cbegin(view), + KE::cend(view), std::move(comp)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION bool is_sorted(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last) { + return Impl::is_sorted_team_impl(teamHandle, first, last); +} + +template , int> = 0> +KOKKOS_FUNCTION bool is_sorted( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + namespace KE = ::Kokkos::Experimental; + return Impl::is_sorted_team_impl(teamHandle, KE::cbegin(view), + KE::cend(view)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool is_sorted(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + return Impl::is_sorted_team_impl(teamHandle, first, last, std::move(comp)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool is_sorted( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, ComparatorType comp) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_not_openmptarget(teamHandle); + + namespace KE = ::Kokkos::Experimental; + return Impl::is_sorted_team_impl(teamHandle, KE::cbegin(view), KE::cend(view), + std::move(comp)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp b/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp index 276b3bb884..96a17b6785 100644 --- a/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp @@ -23,58 +23,78 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::is_sorted_until_impl( + return Impl::is_sorted_until_exespace_impl( "Kokkos::is_sorted_until_iterator_api_default", ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::is_sorted_until_impl(label, ex, first, last); + return Impl::is_sorted_until_exespace_impl(label, ex, first, last); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto is_sorted_until(const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default", - ex, KE::begin(view), KE::end(view)); + return Impl::is_sorted_until_exespace_impl( + "Kokkos::is_sorted_until_view_api_default", ex, KE::begin(view), + KE::end(view)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto is_sorted_until(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view)); + return Impl::is_sorted_until_exespace_impl(label, ex, KE::begin(view), + KE::end(view)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::is_sorted_until_impl( + return Impl::is_sorted_until_exespace_impl( "Kokkos::is_sorted_until_iterator_api_default", ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::is_sorted_until_impl(label, ex, first, last, std::move(comp)); + return Impl::is_sorted_until_exespace_impl(label, ex, first, last, + std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto is_sorted_until(const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { @@ -82,13 +102,15 @@ auto is_sorted_until(const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_until_impl("Kokkos::is_sorted_until_view_api_default", - ex, KE::begin(view), KE::end(view), - std::move(comp)); + return Impl::is_sorted_until_exespace_impl( + "Kokkos::is_sorted_until_view_api_default", ex, KE::begin(view), + KE::end(view), std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto is_sorted_until(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { @@ -96,8 +118,57 @@ auto is_sorted_until(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; - return Impl::is_sorted_until_impl(label, ex, KE::begin(view), KE::end(view), - std::move(comp)); + return Impl::is_sorted_until_exespace_impl(label, ex, KE::begin(view), + KE::end(view), std::move(comp)); +} + +// +// overload set accepting team handle +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType is_sorted_until(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last) { + return Impl::is_sorted_until_team_impl(teamHandle, first, last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto is_sorted_until( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + namespace KE = ::Kokkos::Experimental; + return Impl::is_sorted_until_team_impl(teamHandle, KE::begin(view), + KE::end(view)); +} + +template , int> = 0> +KOKKOS_FUNCTION IteratorType is_sorted_until(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last, + ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + return Impl::is_sorted_until_team_impl(teamHandle, first, last, + std::move(comp)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto is_sorted_until( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, ComparatorType comp) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_not_openmptarget(teamHandle); + + namespace KE = ::Kokkos::Experimental; + return Impl::is_sorted_until_team_impl(teamHandle, KE::begin(view), + KE::end(view), std::move(comp)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dd..5bb2d1039d 100644 --- a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 0a77ef629f..e13479c370 100644 --- a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -23,101 +23,181 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + std::enable_if_t, int> = 0> bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { - return Impl::lexicographical_compare_impl( + return Impl::lexicographical_compare_exespace_impl( "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1, first2, last2); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + std::enable_if_t, int> = 0> bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { - return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2, - last2); + return Impl::lexicographical_compare_exespace_impl(label, ex, first1, last1, + first2, last2); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, + std::enable_if_t, int> = 0> bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::lexicographical_compare_impl( + return Impl::lexicographical_compare_exespace_impl( "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), KE::cend(view2)); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, + std::enable_if_t, int> = 0> bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1), - KE::cend(view1), KE::cbegin(view2), - KE::cend(view2)); + return Impl::lexicographical_compare_exespace_impl( + label, ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), + KE::cend(view2)); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + class ComparatorType, + std::enable_if_t, int> = 0> bool lexicographical_compare(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, ComparatorType comp) { - return Impl::lexicographical_compare_impl( + return Impl::lexicographical_compare_exespace_impl( "Kokkos::lexicographical_compare_iterator_api_default", ex, first1, last1, first2, last2, comp); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + class ComparatorType, + std::enable_if_t, int> = 0> bool lexicographical_compare(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, ComparatorType comp) { - return Impl::lexicographical_compare_impl(label, ex, first1, last1, first2, - last2, comp); + return Impl::lexicographical_compare_exespace_impl(label, ex, first1, last1, + first2, last2, comp); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, class ComparatorType, + std::enable_if_t, int> = 0> bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::lexicographical_compare_impl( + return Impl::lexicographical_compare_exespace_impl( "Kokkos::lexicographical_compare_view_api_default", ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), KE::cend(view2), comp); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, class ComparatorType, + std::enable_if_t, int> = 0> bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::lexicographical_compare_exespace_impl( + label, ex, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), + KE::cend(view2), comp); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION bool lexicographical_compare(const TeamHandleType& teamHandle, + IteratorType1 first1, + IteratorType1 last1, + IteratorType2 first2, + IteratorType2 last2) { + return Impl::lexicographical_compare_team_impl(teamHandle, first1, last1, + first2, last2); +} + +template , int> = 0> +KOKKOS_FUNCTION bool lexicographical_compare( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::lexicographical_compare_team_impl( + teamHandle, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), + KE::cend(view2)); +} + +template , int> = 0> +KOKKOS_FUNCTION bool lexicographical_compare( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, ComparatorType comp) { + return Impl::lexicographical_compare_team_impl(teamHandle, first1, last1, + first2, last2, comp); +} + +template , int> = 0> +KOKKOS_FUNCTION bool lexicographical_compare( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::lexicographical_compare_impl(label, ex, KE::cbegin(view1), - KE::cend(view1), KE::cbegin(view2), - KE::cend(view2), comp); + return Impl::lexicographical_compare_team_impl( + teamHandle, KE::cbegin(view1), KE::cend(view1), KE::cbegin(view2), + KE::cend(view2), comp); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp b/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp index 2c1374f700..d16bac5bfc 100644 --- a/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp @@ -23,81 +23,148 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_iterator_api_default", ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::min_or_max_element_impl(label, ex, first, last); + return Impl::min_or_max_element_exespace_impl(label, ex, first, + last); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_iterator_api_default", ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( label, ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_view_api_default", ex, begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::min_or_max_element_impl(label, ex, begin(v), - end(v)); + return Impl::min_or_max_element_exespace_impl(label, ex, + begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_view_api_default", ex, begin(v), end(v), std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION auto max_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last) { + return Impl::min_or_max_element_team_impl(teamHandle, first, + last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto max_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + return Impl::min_or_max_element_team_impl(teamHandle, begin(v), + end(v)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto max_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + return Impl::min_or_max_element_team_impl( + teamHandle, first, last, std::move(comp)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto max_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, ComparatorType comp) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + Impl::static_assert_is_not_openmptarget(teamHandle); + return Impl::min_or_max_element_team_impl( + teamHandle, begin(v), end(v), std::move(comp)); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_MinElement.hpp b/algorithms/src/std_algorithms/Kokkos_MinElement.hpp index 1d03b7c962..2a53fce3e2 100644 --- a/algorithms/src/std_algorithms/Kokkos_MinElement.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MinElement.hpp @@ -23,81 +23,148 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_iterator_api_default", ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::min_or_max_element_impl(label, ex, first, last); + return Impl::min_or_max_element_exespace_impl(label, ex, first, + last); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_iterator_api_default", ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( label, ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_view_api_default", ex, begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_view_api_default", ex, begin(v), end(v), std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::min_or_max_element_impl(label, ex, begin(v), - end(v)); + return Impl::min_or_max_element_exespace_impl(label, ex, + begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_impl( + return Impl::min_or_max_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION auto min_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last) { + return Impl::min_or_max_element_team_impl(teamHandle, first, + last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto min_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + return Impl::min_or_max_element_team_impl(teamHandle, begin(v), + end(v)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto min_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + return Impl::min_or_max_element_team_impl( + teamHandle, first, last, std::move(comp)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto min_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + return Impl::min_or_max_element_team_impl( + teamHandle, begin(v), end(v), std::move(comp)); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp b/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp index d481b499cc..c3a1f73ef6 100644 --- a/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp @@ -23,82 +23,151 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_iterator_api_default", ex, first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::minmax_element_impl(label, ex, first, last); + return Impl::minmax_element_exespace_impl(label, ex, + first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_iterator_api_default", ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename ComparatorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( label, ex, first, last, std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::minmax_element_impl(label, ex, begin(v), - end(v)); + return Impl::minmax_element_exespace_impl( + label, ex, begin(v), end(v)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v), std::move(comp)); } -template +template < + typename ExecutionSpace, typename DataType, typename ComparatorType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_impl( + return Impl::minmax_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION auto minmax_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last) { + return Impl::minmax_element_team_impl(teamHandle, first, + last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto minmax_element(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ComparatorType comp) { + Impl::static_assert_is_not_openmptarget(teamHandle); + + return Impl::minmax_element_team_impl( + teamHandle, first, last, std::move(comp)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto minmax_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + return Impl::minmax_element_team_impl(teamHandle, + begin(v), end(v)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto minmax_element( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, ComparatorType comp) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + Impl::static_assert_is_not_openmptarget(teamHandle); + + return Impl::minmax_element_team_impl( + teamHandle, begin(v), end(v), std::move(comp)); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp b/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp index 13c994ca90..090afe69e3 100644 --- a/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Mismatch.hpp @@ -30,46 +30,60 @@ namespace Experimental { // // makes API ambiguous (with the overload accepting views). -template +// +// overload set accepting execution space +// +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + std::enable_if_t, int> = 0> ::Kokkos::pair mismatch(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { - return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex, - first1, last1, first2, last2); + return Impl::mismatch_exespace_impl("Kokkos::mismatch_iterator_api_default", + ex, first1, last1, first2, last2); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + class BinaryPredicateType, + std::enable_if_t, int> = 0> ::Kokkos::pair mismatch( const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, BinaryPredicateType&& predicate) { - return Impl::mismatch_impl("Kokkos::mismatch_iterator_api_default", ex, - first1, last1, first2, last2, - std::forward(predicate)); + return Impl::mismatch_exespace_impl( + "Kokkos::mismatch_iterator_api_default", ex, first1, last1, first2, last2, + std::forward(predicate)); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + std::enable_if_t, int> = 0> ::Kokkos::pair mismatch( const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { - return Impl::mismatch_impl(label, ex, first1, last1, first2, last2); + return Impl::mismatch_exespace_impl(label, ex, first1, last1, first2, last2); } -template +template < + class ExecutionSpace, class IteratorType1, class IteratorType2, + class BinaryPredicateType, + std::enable_if_t, int> = 0> ::Kokkos::pair mismatch( const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, BinaryPredicateType&& predicate) { - return Impl::mismatch_impl(label, ex, first1, last1, first2, last2, - std::forward(predicate)); + return Impl::mismatch_exespace_impl( + label, ex, first1, last1, first2, last2, + std::forward(predicate)); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, + std::enable_if_t, int> = 0> auto mismatch(const ExecutionSpace& ex, const ::Kokkos::View& view1, const ::Kokkos::View& view2) { @@ -77,13 +91,15 @@ auto mismatch(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex, - KE::begin(view1), KE::end(view1), KE::begin(view2), - KE::end(view2)); + return Impl::mismatch_exespace_impl("Kokkos::mismatch_view_api_default", ex, + KE::begin(view1), KE::end(view1), + KE::begin(view2), KE::end(view2)); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, class BinaryPredicateType, + std::enable_if_t, int> = 0> auto mismatch(const ExecutionSpace& ex, const ::Kokkos::View& view1, const ::Kokkos::View& view2, @@ -92,14 +108,16 @@ auto mismatch(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::mismatch_impl("Kokkos::mismatch_view_api_default", ex, - KE::begin(view1), KE::end(view1), KE::begin(view2), - KE::end(view2), - std::forward(predicate)); + return Impl::mismatch_exespace_impl( + "Kokkos::mismatch_view_api_default", ex, KE::begin(view1), KE::end(view1), + KE::begin(view2), KE::end(view2), + std::forward(predicate)); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, + std::enable_if_t, int> = 0> auto mismatch(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, const ::Kokkos::View& view2) { @@ -107,12 +125,15 @@ auto mismatch(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1), - KE::begin(view2), KE::end(view2)); + return Impl::mismatch_exespace_impl(label, ex, KE::begin(view1), + KE::end(view1), KE::begin(view2), + KE::end(view2)); } -template +template < + class ExecutionSpace, class DataType1, class... Properties1, + class DataType2, class... Properties2, class BinaryPredicateType, + std::enable_if_t, int> = 0> auto mismatch(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, const ::Kokkos::View& view2, @@ -121,9 +142,65 @@ auto mismatch(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); namespace KE = ::Kokkos::Experimental; - return Impl::mismatch_impl(label, ex, KE::begin(view1), KE::end(view1), - KE::begin(view2), KE::end(view2), - std::forward(predicate)); + return Impl::mismatch_exespace_impl( + label, ex, KE::begin(view1), KE::end(view1), KE::begin(view2), + KE::end(view2), std::forward(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION ::Kokkos::pair mismatch( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2) { + return Impl::mismatch_team_impl(teamHandle, first1, last1, first2, last2); +} + +template , int> = 0> +KOKKOS_FUNCTION ::Kokkos::pair mismatch( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, + BinaryPredicateType&& predicate) { + return Impl::mismatch_team_impl(teamHandle, first1, last1, first2, last2, + std::forward(predicate)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto mismatch( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::mismatch_team_impl(teamHandle, KE::begin(view1), KE::end(view1), + KE::begin(view2), KE::end(view2)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto mismatch( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view1, + const ::Kokkos::View& view2, + BinaryPredicateType&& predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); + + namespace KE = ::Kokkos::Experimental; + return Impl::mismatch_team_impl(teamHandle, KE::begin(view1), KE::end(view1), + KE::begin(view2), KE::end(view2), + std::forward(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Move.hpp b/algorithms/src/std_algorithms/Kokkos_Move.hpp index d49acd9f70..ac308ea184 100644 --- a/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -23,41 +23,81 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator move(const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::move_impl("Kokkos::move_iterator_api_default", ex, first, last, - d_first); + return Impl::move_exespace_impl("Kokkos::move_iterator_api_default", ex, + first, last, d_first); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator move(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::move_impl(label, ex, first, last, d_first); + return Impl::move_exespace_impl(label, ex, first, last, d_first); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::move_impl("Kokkos::move_view_api_default", ex, begin(source), - end(source), begin(dest)); + return Impl::move_exespace_impl("Kokkos::move_view_api_default", ex, + begin(source), end(source), begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::move_exespace_impl(label, ex, begin(source), end(source), + begin(dest)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator move(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + OutputIterator d_first) { + return Impl::move_team_impl(teamHandle, first, last, d_first); +} + +template , int> = 0> +KOKKOS_FUNCTION auto move( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::move_impl(label, ex, begin(source), end(source), begin(dest)); + return Impl::move_team_impl(teamHandle, begin(source), end(source), + begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 60d50fa881..2789ab2179 100644 --- a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -23,42 +23,83 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType2 move_backward(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 d_last) { - return Impl::move_backward_impl("Kokkos::move_backward_iterator_api_default", - ex, first, last, d_last); + return Impl::move_backward_exespace_impl( + "Kokkos::move_backward_iterator_api_default", ex, first, last, d_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::move_backward_impl("Kokkos::move_backward_view_api_default", ex, - begin(source), end(source), end(dest)); + return Impl::move_backward_exespace_impl( + "Kokkos::move_backward_view_api_default", ex, begin(source), end(source), + end(dest)); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType2 move_backward(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 d_last) { - return Impl::move_backward_impl(label, ex, first, last, d_last); + return Impl::move_backward_exespace_impl(label, ex, first, last, d_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::move_backward_exespace_impl(label, ex, begin(source), + end(source), end(dest)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType2 move_backward(const TeamHandleType& teamHandle, + IteratorType1 first, + IteratorType1 last, + IteratorType2 d_last) { + return Impl::move_backward_team_impl(teamHandle, first, last, d_last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto move_backward( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::move_backward_impl(label, ex, begin(source), end(source), - end(dest)); + return Impl::move_backward_team_impl(teamHandle, begin(source), end(source), + end(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp b/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp index cf5de3b72b..f7baab3fc0 100644 --- a/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_NoneOf.hpp @@ -23,41 +23,80 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool none_of(const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::none_of_impl("Kokkos::none_of_iterator_api_default", ex, first, - last, predicate); + return Impl::none_of_exespace_impl("Kokkos::none_of_iterator_api_default", ex, + first, last, predicate); } -template +template < + typename ExecutionSpace, typename IteratorType, typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool none_of(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, Predicate predicate) { - return Impl::none_of_impl(label, ex, first, last, predicate); + return Impl::none_of_exespace_impl(label, ex, first, last, predicate); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool none_of(const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::none_of_impl("Kokkos::none_of_view_api_default", ex, - KE::cbegin(v), KE::cend(v), std::move(predicate)); + return Impl::none_of_exespace_impl("Kokkos::none_of_view_api_default", ex, + KE::cbegin(v), KE::cend(v), + std::move(predicate)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename Predicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool none_of(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, Predicate predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::none_of_impl(label, ex, KE::cbegin(v), KE::cend(v), - std::move(predicate)); + return Impl::none_of_exespace_impl(label, ex, KE::cbegin(v), KE::cend(v), + std::move(predicate)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template +KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::is_team_handle::value, bool> + none_of(const TeamHandleType& teamHandle, IteratorType first, + IteratorType last, Predicate predicate) { + return Impl::none_of_team_impl(teamHandle, first, last, predicate); +} + +template +KOKKOS_FUNCTION + std::enable_if_t<::Kokkos::is_team_handle::value, bool> + none_of(const TeamHandleType& teamHandle, + const ::Kokkos::View& v, + Predicate predicate) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + + namespace KE = ::Kokkos::Experimental; + return Impl::none_of_team_impl(teamHandle, KE::cbegin(v), KE::cend(v), + std::move(predicate)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp b/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp index 38c0a35b62..a1feee8d6d 100644 --- a/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_PartitionCopy.hpp @@ -23,57 +23,103 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorTrueType, typename OutputIteratorFalseType, + typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> ::Kokkos::pair partition_copy( const ExecutionSpace& ex, InputIteratorType from_first, InputIteratorType from_last, OutputIteratorTrueType to_first_true, OutputIteratorFalseType to_first_false, PredicateType p) { - return Impl::partition_copy_impl( + return Impl::partition_copy_exespace_impl( "Kokkos::partition_copy_iterator_api_default", ex, from_first, from_last, to_first_true, to_first_false, std::move(p)); } -template +template < + typename ExecutionSpace, typename InputIteratorType, + typename OutputIteratorTrueType, typename OutputIteratorFalseType, + typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> ::Kokkos::pair partition_copy( const std::string& label, const ExecutionSpace& ex, InputIteratorType from_first, InputIteratorType from_last, OutputIteratorTrueType to_first_true, OutputIteratorFalseType to_first_false, PredicateType p) { - return Impl::partition_copy_impl(label, ex, from_first, from_last, - to_first_true, to_first_false, std::move(p)); + return Impl::partition_copy_exespace_impl(label, ex, from_first, from_last, + to_first_true, to_first_false, + std::move(p)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename DataType3, + typename... Properties3, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto partition_copy( const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest_true, const ::Kokkos::View& view_dest_false, PredicateType p) { - return Impl::partition_copy_impl("Kokkos::partition_copy_view_api_default", - ex, cbegin(view_from), cend(view_from), - begin(view_dest_true), - begin(view_dest_false), std::move(p)); + return Impl::partition_copy_exespace_impl( + "Kokkos::partition_copy_view_api_default", ex, cbegin(view_from), + cend(view_from), begin(view_dest_true), begin(view_dest_false), + std::move(p)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename DataType3, + typename... Properties3, typename PredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto partition_copy( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest_true, const ::Kokkos::View& view_dest_false, PredicateType p) { - return Impl::partition_copy_impl(label, ex, cbegin(view_from), - cend(view_from), begin(view_dest_true), - begin(view_dest_false), std::move(p)); + return Impl::partition_copy_exespace_impl( + label, ex, cbegin(view_from), cend(view_from), begin(view_dest_true), + begin(view_dest_false), std::move(p)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION ::Kokkos::pair +partition_copy(const TeamHandleType& teamHandle, InputIteratorType from_first, + InputIteratorType from_last, + OutputIteratorTrueType to_first_true, + OutputIteratorFalseType to_first_false, PredicateType p) { + return Impl::partition_copy_team_impl(teamHandle, from_first, from_last, + to_first_true, to_first_false, + std::move(p)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto partition_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest_true, + const ::Kokkos::View& view_dest_false, + PredicateType p) { + return Impl::partition_copy_team_impl(teamHandle, cbegin(view_from), + cend(view_from), begin(view_dest_true), + begin(view_dest_false), std::move(p)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp b/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp index 24798e377e..60cbeeda87 100644 --- a/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp +++ b/algorithms/src/std_algorithms/Kokkos_PartitionPoint.hpp @@ -23,38 +23,78 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType partition_point(const ExecutionSpace& ex, IteratorType first, IteratorType last, UnaryPredicate p) { - return Impl::partition_point_impl( + return Impl::partition_point_exespace_impl( "Kokkos::partitioned_point_iterator_api_default", ex, first, last, std::move(p)); } -template +template < + typename ExecutionSpace, typename IteratorType, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType partition_point(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, UnaryPredicate p) { - return Impl::partition_point_impl(label, ex, first, last, std::move(p)); + return Impl::partition_point_exespace_impl(label, ex, first, last, + std::move(p)); } -template +template < + typename ExecutionSpace, typename UnaryPredicate, typename DataType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto partition_point(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, UnaryPredicate p) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::partition_point_impl(label, ex, begin(v), end(v), std::move(p)); + return Impl::partition_point_exespace_impl(label, ex, begin(v), end(v), + std::move(p)); } -template +template < + typename ExecutionSpace, typename UnaryPredicate, typename DataType, + typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto partition_point(const ExecutionSpace& ex, const ::Kokkos::View& v, UnaryPredicate p) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - return Impl::partition_point_impl("Kokkos::partition_point_view_api_default", - ex, begin(v), end(v), std::move(p)); + return Impl::partition_point_exespace_impl( + "Kokkos::partition_point_view_api_default", ex, begin(v), end(v), + std::move(p)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType partition_point(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last, + UnaryPredicate p) { + return Impl::partition_point_team_impl(teamHandle, first, last, std::move(p)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto partition_point( + const TeamHandleType& teamHandle, + const ::Kokkos::View& v, UnaryPredicate p) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); + return Impl::partition_point_team_impl(teamHandle, begin(v), end(v), + std::move(p)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index a31fa1497a..b84f00f8bb 100644 --- a/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -23,28 +23,38 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // // overload set 1 // -template +template ::value, + int> = 0> typename IteratorType::value_type reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( "Kokkos::reduce_default_functors_iterator_api", ex, first, last, typename IteratorType::value_type()); } -template +template ::value, + int> = 0> typename IteratorType::value_type reduce(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( label, ex, first, last, typename IteratorType::value_type()); } -template +template ::value, + int> = 0> auto reduce(const ExecutionSpace& ex, const ::Kokkos::View& view) { namespace KE = ::Kokkos::Experimental; @@ -53,12 +63,14 @@ auto reduce(const ExecutionSpace& ex, using view_type = ::Kokkos::View; using value_type = typename view_type::value_type; - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( "Kokkos::reduce_default_functors_view_api", ex, KE::cbegin(view), KE::cend(view), value_type()); } -template +template ::value, + int> = 0> auto reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view) { namespace KE = ::Kokkos::Experimental; @@ -67,37 +79,43 @@ auto reduce(const std::string& label, const ExecutionSpace& ex, using view_type = ::Kokkos::View; using value_type = typename view_type::value_type; - return Impl::reduce_default_functors_impl(label, ex, KE::cbegin(view), - KE::cend(view), value_type()); + return Impl::reduce_default_functors_exespace_impl( + label, ex, KE::cbegin(view), KE::cend(view), value_type()); } // // overload set2: // -template +template ::value, + int> = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( "Kokkos::reduce_default_functors_iterator_api", ex, first, last, init_reduction_value); } -template +template ::value, + int> = 0> ValueType reduce(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::reduce_default_functors_impl(label, ex, first, last, - init_reduction_value); + return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, + init_reduction_value); } -template +template ::value, + int> = 0> ValueType reduce(const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value) { @@ -107,13 +125,15 @@ ValueType reduce(const ExecutionSpace& ex, namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( "Kokkos::reduce_default_functors_view_api", ex, KE::cbegin(view), KE::cend(view), init_reduction_value); } -template +template ::value, + int> = 0> ValueType reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value) { @@ -123,40 +143,46 @@ ValueType reduce(const std::string& label, const ExecutionSpace& ex, namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::reduce_default_functors_impl( + return Impl::reduce_default_functors_exespace_impl( label, ex, KE::cbegin(view), KE::cend(view), init_reduction_value); } // // overload set 3 // -template +template ::value, + int> = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::reduce_custom_functors_impl( + return Impl::reduce_custom_functors_exespace_impl( "Kokkos::reduce_default_functors_iterator_api", ex, first, last, init_reduction_value, joiner); } -template +template ::value, + int> = 0> ValueType reduce(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::reduce_custom_functors_impl(label, ex, first, last, - init_reduction_value, joiner); + return Impl::reduce_custom_functors_exespace_impl( + label, ex, first, last, init_reduction_value, joiner); } -template +template ::value, + int> = 0> ValueType reduce(const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { @@ -166,13 +192,15 @@ ValueType reduce(const ExecutionSpace& ex, namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::reduce_custom_functors_impl( + return Impl::reduce_custom_functors_exespace_impl( "Kokkos::reduce_custom_functors_view_api", ex, KE::cbegin(view), KE::cend(view), init_reduction_value, joiner); } -template +template ::value, + int> = 0> ValueType reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { @@ -182,9 +210,114 @@ ValueType reduce(const std::string& label, const ExecutionSpace& ex, namespace KE = ::Kokkos::Experimental; Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::reduce_custom_functors_impl(label, ex, KE::cbegin(view), - KE::cend(view), init_reduction_value, - joiner); + return Impl::reduce_custom_functors_exespace_impl( + label, ex, KE::cbegin(view), KE::cend(view), init_reduction_value, + joiner); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// +// overload set 1 +// +template < + typename TeamHandleType, typename IteratorType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION typename IteratorType::value_type reduce( + const TeamHandleType& teamHandle, IteratorType first, IteratorType last) { + return Impl::reduce_default_functors_team_impl( + teamHandle, first, last, typename IteratorType::value_type()); +} + +template < + typename TeamHandleType, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto reduce( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view) { + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + using view_type = ::Kokkos::View; + using value_type = typename view_type::value_type; + + return Impl::reduce_default_functors_team_impl(teamHandle, KE::cbegin(view), + KE::cend(view), value_type()); +} + +// +// overload set2: +// +template < + typename TeamHandleType, typename IteratorType, typename ValueType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ValueType init_reduction_value) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + return Impl::reduce_default_functors_team_impl(teamHandle, first, last, + init_reduction_value); +} + +template < + typename TeamHandleType, typename DataType, typename... Properties, + typename ValueType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType +reduce(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + ValueType init_reduction_value) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + return Impl::reduce_default_functors_team_impl( + teamHandle, KE::cbegin(view), KE::cend(view), init_reduction_value); +} + +// +// overload set 3 +// +template < + typename TeamHandleType, typename IteratorType, typename ValueType, + typename BinaryOp, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ValueType init_reduction_value, + BinaryOp joiner) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, + init_reduction_value, joiner); +} + +template < + typename TeamHandleType, typename DataType, typename... Properties, + typename ValueType, typename BinaryOp, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType +reduce(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + ValueType init_reduction_value, BinaryOp joiner) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + return Impl::reduce_custom_functors_team_impl(teamHandle, KE::cbegin(view), + KE::cend(view), + init_reduction_value, joiner); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Remove.hpp b/algorithms/src/std_algorithms/Kokkos_Remove.hpp index c8602d2f53..8a429d8d51 100644 --- a/algorithms/src/std_algorithms/Kokkos_Remove.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Remove.hpp @@ -23,38 +23,74 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename Iterator, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> Iterator remove(const ExecutionSpace& ex, Iterator first, Iterator last, const ValueType& value) { - return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex, first, - last, value); + return Impl::remove_exespace_impl("Kokkos::remove_iterator_api_default", ex, + first, last, value); } -template +template < + typename ExecutionSpace, typename Iterator, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> Iterator remove(const std::string& label, const ExecutionSpace& ex, Iterator first, Iterator last, const ValueType& value) { - return Impl::remove_impl(label, ex, first, last, value); + return Impl::remove_exespace_impl(label, ex, first, last, value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove(const ExecutionSpace& ex, const ::Kokkos::View& view, const ValueType& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::remove_impl("Kokkos::remove_iterator_api_default", ex, - ::Kokkos::Experimental::begin(view), - ::Kokkos::Experimental::end(view), value); + return Impl::remove_exespace_impl("Kokkos::remove_iterator_api_default", ex, + ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), value); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ValueType& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::remove_impl(label, ex, ::Kokkos::Experimental::begin(view), - ::Kokkos::Experimental::end(view), value); + return Impl::remove_exespace_impl(label, ex, + ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION Iterator remove(const TeamHandleType& teamHandle, + Iterator first, Iterator last, + const ValueType& value) { + return Impl::remove_team_impl(teamHandle, first, last, value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto remove(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ValueType& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::remove_team_impl(teamHandle, ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp b/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp index c2c06f6202..4b8fa9fe07 100644 --- a/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_RemoveCopy.hpp @@ -23,26 +23,36 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator remove_copy(const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const ValueType& value) { - return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex, - first_from, last_from, first_dest, value); + return Impl::remove_copy_exespace_impl( + "Kokkos::remove_copy_iterator_api_default", ex, first_from, last_from, + first_dest, value); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator remove_copy(const std::string& label, const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const ValueType& value) { - return Impl::remove_copy_impl(label, ex, first_from, last_from, first_dest, - value); + return Impl::remove_copy_exespace_impl(label, ex, first_from, last_from, + first_dest, value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_copy(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -50,15 +60,17 @@ auto remove_copy(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::remove_copy_impl("Kokkos::remove_copy_iterator_api_default", ex, - ::Kokkos::Experimental::cbegin(view_from), - ::Kokkos::Experimental::cend(view_from), - ::Kokkos::Experimental::begin(view_dest), - value); + return Impl::remove_copy_exespace_impl( + "Kokkos::remove_copy_iterator_api_default", ex, + ::Kokkos::Experimental::cbegin(view_from), + ::Kokkos::Experimental::cend(view_from), + ::Kokkos::Experimental::begin(view_dest), value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -66,12 +78,46 @@ auto remove_copy(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::remove_copy_impl( + return Impl::remove_copy_exespace_impl( label, ex, ::Kokkos::Experimental::cbegin(view_from), ::Kokkos::Experimental::cend(view_from), ::Kokkos::Experimental::begin(view_dest), value); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator remove_copy(const TeamHandleType& teamHandle, + InputIterator first_from, + InputIterator last_from, + OutputIterator first_dest, + const ValueType& value) { + return Impl::remove_copy_team_impl(teamHandle, first_from, last_from, + first_dest, value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto remove_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + const ValueType& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + + return Impl::remove_copy_team_impl( + teamHandle, ::Kokkos::Experimental::cbegin(view_from), + ::Kokkos::Experimental::cend(view_from), + ::Kokkos::Experimental::begin(view_dest), value); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp index 6d642ed6f0..45e2b54bb6 100644 --- a/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_RemoveCopyIf.hpp @@ -23,30 +23,39 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator remove_copy_if(const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const UnaryPredicate& pred) { - return Impl::remove_copy_if_impl( + return Impl::remove_copy_if_exespace_impl( "Kokkos::remove_copy_if_iterator_api_default", ex, first_from, last_from, first_dest, pred); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator remove_copy_if(const std::string& label, const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const UnaryPredicate& pred) { - return Impl::remove_copy_if_impl(label, ex, first_from, last_from, first_dest, - pred); + return Impl::remove_copy_if_exespace_impl(label, ex, first_from, last_from, + first_dest, pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_copy_if(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -54,15 +63,17 @@ auto remove_copy_if(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::remove_copy_if_impl( + return Impl::remove_copy_if_exespace_impl( "Kokkos::remove_copy_if_iterator_api_default", ex, ::Kokkos::Experimental::cbegin(view_from), ::Kokkos::Experimental::cend(view_from), ::Kokkos::Experimental::begin(view_dest), pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -70,12 +81,46 @@ auto remove_copy_if(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - return Impl::remove_copy_if_impl( + return Impl::remove_copy_if_exespace_impl( label, ex, ::Kokkos::Experimental::cbegin(view_from), ::Kokkos::Experimental::cend(view_from), ::Kokkos::Experimental::begin(view_dest), pred); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator remove_copy_if(const TeamHandleType& teamHandle, + InputIterator first_from, + InputIterator last_from, + OutputIterator first_dest, + const UnaryPredicate& pred) { + return Impl::remove_copy_if_team_impl(teamHandle, first_from, last_from, + first_dest, pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto remove_copy_if( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + const UnaryPredicate& pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + + return Impl::remove_copy_if_team_impl( + teamHandle, ::Kokkos::Experimental::cbegin(view_from), + ::Kokkos::Experimental::cend(view_from), + ::Kokkos::Experimental::begin(view_dest), pred); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp b/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp index 4062e8d373..38461a37f2 100644 --- a/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_RemoveIf.hpp @@ -23,39 +23,77 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename Iterator, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> Iterator remove_if(const ExecutionSpace& ex, Iterator first, Iterator last, UnaryPredicate pred) { - return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex, - first, last, pred); + return Impl::remove_if_exespace_impl("Kokkos::remove_if_iterator_api_default", + ex, first, last, pred); } -template +template < + typename ExecutionSpace, typename Iterator, typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> Iterator remove_if(const std::string& label, const ExecutionSpace& ex, Iterator first, Iterator last, UnaryPredicate pred) { - return Impl::remove_if_impl(label, ex, first, last, pred); + return Impl::remove_if_exespace_impl(label, ex, first, last, pred); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_if(const ExecutionSpace& ex, const ::Kokkos::View& view, UnaryPredicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::remove_if_impl("Kokkos::remove_if_iterator_api_default", ex, - ::Kokkos::Experimental::begin(view), - ::Kokkos::Experimental::end(view), pred); + return Impl::remove_if_exespace_impl("Kokkos::remove_if_iterator_api_default", + ex, ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), pred); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename UnaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto remove_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, UnaryPredicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::remove_if_impl(label, ex, ::Kokkos::Experimental::begin(view), - ::Kokkos::Experimental::end(view), pred); + return Impl::remove_if_exespace_impl(label, ex, + ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION Iterator remove_if(const TeamHandleType& teamHandle, + Iterator first, Iterator last, + UnaryPredicate pred) { + return Impl::remove_if_team_impl(teamHandle, first, last, pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto remove_if( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, UnaryPredicate pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + return Impl::remove_if_team_impl(teamHandle, + ::Kokkos::Experimental::begin(view), + ::Kokkos::Experimental::end(view), pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Replace.hpp b/algorithms/src/std_algorithms/Kokkos_Replace.hpp index 4d1490ded0..29afc4f0c2 100644 --- a/algorithms/src/std_algorithms/Kokkos_Replace.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Replace.hpp @@ -23,40 +23,77 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename Iterator, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace(const ExecutionSpace& ex, Iterator first, Iterator last, const ValueType& old_value, const ValueType& new_value) { - return Impl::replace_impl("Kokkos::replace_iterator_api", ex, first, last, - old_value, new_value); + Impl::replace_exespace_impl("Kokkos::replace_iterator_api", ex, first, last, + old_value, new_value); } -template +template < + typename ExecutionSpace, typename Iterator, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace(const std::string& label, const ExecutionSpace& ex, Iterator first, Iterator last, const ValueType& old_value, const ValueType& new_value) { - return Impl::replace_impl(label, ex, first, last, old_value, new_value); + Impl::replace_exespace_impl(label, ex, first, last, old_value, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace(const ExecutionSpace& ex, const ::Kokkos::View& view, const ValueType& old_value, const ValueType& new_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::replace_impl("Kokkos::replace_view_api", ex, KE::begin(view), - KE::end(view), old_value, new_value); + Impl::replace_exespace_impl("Kokkos::replace_view_api", ex, KE::begin(view), + KE::end(view), old_value, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ValueType& old_value, const ValueType& new_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::replace_impl(label, ex, KE::begin(view), KE::end(view), - old_value, new_value); + Impl::replace_exespace_impl(label, ex, KE::begin(view), KE::end(view), + old_value, new_value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION void replace(const TeamHandleType& teamHandle, Iterator first, + Iterator last, const ValueType& old_value, + const ValueType& new_value) { + Impl::replace_team_impl(teamHandle, first, last, old_value, new_value); +} + +template , int> = 0> +KOKKOS_FUNCTION void replace( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ValueType& old_value, const ValueType& new_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + namespace KE = ::Kokkos::Experimental; + Impl::replace_team_impl(teamHandle, KE::begin(view), KE::end(view), old_value, + new_value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp b/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp index e7f464e4bd..04d5767e89 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReplaceCopy.hpp @@ -23,30 +23,39 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator replace_copy(const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const ValueType& old_value, const ValueType& new_value) { - return Impl::replace_copy_impl("Kokkos::replace_copy_iterator_api", ex, - first_from, last_from, first_dest, old_value, - new_value); + return Impl::replace_copy_exespace_impl("Kokkos::replace_copy_iterator_api", + ex, first_from, last_from, first_dest, + old_value, new_value); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator replace_copy(const std::string& label, const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, const ValueType& old_value, const ValueType& new_value) { - return Impl::replace_copy_impl(label, ex, first_from, last_from, first_dest, - old_value, new_value); + return Impl::replace_copy_exespace_impl(label, ex, first_from, last_from, + first_dest, old_value, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto replace_copy(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -54,13 +63,15 @@ auto replace_copy(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::replace_copy_impl("Kokkos::replace_copy_view_api", ex, - KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), old_value, new_value); + return Impl::replace_copy_exespace_impl( + "Kokkos::replace_copy_view_api", ex, KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), old_value, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto replace_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -68,9 +79,43 @@ auto replace_copy(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::replace_copy_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - old_value, new_value); + return Impl::replace_copy_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), old_value, new_value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator replace_copy(const TeamHandleType& teamHandle, + InputIterator first_from, + InputIterator last_from, + OutputIterator first_dest, + const ValueType& old_value, + const ValueType& new_value) { + return Impl::replace_copy_team_impl(teamHandle, first_from, last_from, + first_dest, old_value, new_value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto replace_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + const ValueType& old_value, const ValueType& new_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::replace_copy_team_impl(teamHandle, KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), + old_value, new_value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp index 71ae8f8452..b87163f194 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReplaceCopyIf.hpp @@ -23,33 +23,42 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename PredicateType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator replace_copy_if(const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, PredicateType pred, const ValueType& new_value) { - return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_iterator_api", ex, - first_from, last_from, first_dest, pred, - new_value); + return Impl::replace_copy_if_exespace_impl( + "Kokkos::replace_copy_if_iterator_api", ex, first_from, last_from, + first_dest, pred, new_value); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename PredicateType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator replace_copy_if(const std::string& label, const ExecutionSpace& ex, InputIterator first_from, InputIterator last_from, OutputIterator first_dest, PredicateType pred, const ValueType& new_value) { - return Impl::replace_copy_if_impl(label, ex, first_from, last_from, - first_dest, pred, new_value); + return Impl::replace_copy_if_exespace_impl(label, ex, first_from, last_from, + first_dest, pred, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename PredicateType, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto replace_copy_if(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -57,14 +66,16 @@ auto replace_copy_if(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::replace_copy_if_impl("Kokkos::replace_copy_if_view_api", ex, - KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), pred, new_value); + return Impl::replace_copy_if_exespace_impl( + "Kokkos::replace_copy_if_view_api", ex, KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), pred, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename PredicateType, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto replace_copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, @@ -72,9 +83,44 @@ auto replace_copy_if(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::replace_copy_if_impl(label, ex, KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - pred, new_value); + return Impl::replace_copy_if_exespace_impl( + label, ex, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), pred, new_value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator replace_copy_if(const TeamHandleType& teamHandle, + InputIterator first_from, + InputIterator last_from, + OutputIterator first_dest, + PredicateType pred, + const ValueType& new_value) { + return Impl::replace_copy_if_team_impl(teamHandle, first_from, last_from, + first_dest, pred, new_value); +} + +template , int> = 0> +KOKKOS_FUNCTION auto replace_copy_if( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + PredicateType pred, const ValueType& new_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::replace_copy_if_team_impl(teamHandle, KE::cbegin(view_from), + KE::cend(view_from), + KE::begin(view_dest), pred, new_value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp b/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp index 7f06540e06..73af1f16f0 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReplaceIf.hpp @@ -23,43 +23,82 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace_if(const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate pred, const ValueType& new_value) { - return Impl::replace_if_impl("Kokkos::replace_if_iterator_api", ex, first, - last, pred, new_value); + Impl::replace_if_exespace_impl("Kokkos::replace_if_iterator_api", ex, first, + last, pred, new_value); } -template +template < + typename ExecutionSpace, typename InputIterator, typename Predicate, + typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace_if(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, Predicate pred, const ValueType& new_value) { - return Impl::replace_if_impl(label, ex, first, last, pred, new_value); + Impl::replace_if_exespace_impl(label, ex, first, last, pred, new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename Predicate, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace_if(const ExecutionSpace& ex, const ::Kokkos::View& view, Predicate pred, const ValueType& new_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::replace_if_impl("Kokkos::replace_if_view_api", ex, - KE::begin(view), KE::end(view), pred, new_value); + Impl::replace_if_exespace_impl("Kokkos::replace_if_view_api", ex, + KE::begin(view), KE::end(view), pred, + new_value); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename Predicate, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void replace_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, Predicate pred, const ValueType& new_value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::replace_if_impl(label, ex, KE::begin(view), KE::end(view), pred, - new_value); + Impl::replace_if_exespace_impl(label, ex, KE::begin(view), KE::end(view), + pred, new_value); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION void replace_if(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + Predicate pred, const ValueType& new_value) { + Impl::replace_if_team_impl(teamHandle, first, last, pred, new_value); +} + +template , int> = 0> +KOKKOS_FUNCTION void replace_if( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, Predicate pred, + const ValueType& new_value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + namespace KE = ::Kokkos::Experimental; + Impl::replace_if_team_impl(teamHandle, KE::begin(view), KE::end(view), pred, + new_value); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Reverse.hpp b/algorithms/src/std_algorithms/Kokkos_Reverse.hpp index 9f2fc5f3cc..a0786d3a2e 100644 --- a/algorithms/src/std_algorithms/Kokkos_Reverse.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Reverse.hpp @@ -23,34 +23,67 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void reverse(const ExecutionSpace& ex, InputIterator first, InputIterator last) { - return Impl::reverse_impl("Kokkos::reverse_iterator_api_default", ex, first, - last); + return Impl::reverse_exespace_impl("Kokkos::reverse_iterator_api_default", ex, + first, last); } -template +template < + typename ExecutionSpace, typename InputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void reverse(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last) { - return Impl::reverse_impl(label, ex, first, last); + return Impl::reverse_exespace_impl(label, ex, first, last); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void reverse(const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::reverse_impl("Kokkos::reverse_view_api_default", ex, - KE::begin(view), KE::end(view)); + return Impl::reverse_exespace_impl("Kokkos::reverse_view_api_default", ex, + KE::begin(view), KE::end(view)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> void reverse(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::reverse_impl(label, ex, KE::begin(view), KE::end(view)); + return Impl::reverse_exespace_impl(label, ex, KE::begin(view), KE::end(view)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION void reverse(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last) { + return Impl::reverse_team_impl(teamHandle, first, last); +} + +template , int> = 0> +KOKKOS_FUNCTION void reverse( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + namespace KE = ::Kokkos::Experimental; + return Impl::reverse_team_impl(teamHandle, KE::begin(view), KE::end(view)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 279bb22086..66f39c4eaa 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -23,42 +23,83 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator reverse_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::reverse_copy_impl("Kokkos::reverse_copy_iterator_api_default", - ex, first, last, d_first); + return Impl::reverse_copy_exespace_impl( + "Kokkos::reverse_copy_iterator_api_default", ex, first, last, d_first); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator reverse_copy(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::reverse_copy_impl(label, ex, first, last, d_first); + return Impl::reverse_copy_exespace_impl(label, ex, first, last, d_first); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::reverse_copy_impl("Kokkos::reverse_copy_view_api_default", ex, - cbegin(source), cend(source), begin(dest)); + return Impl::reverse_copy_exespace_impl( + "Kokkos::reverse_copy_view_api_default", ex, cbegin(source), cend(source), + begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::reverse_copy_exespace_impl(label, ex, cbegin(source), + cend(source), begin(dest)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator reverse_copy(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator last, + OutputIterator d_first) { + return Impl::reverse_copy_team_impl(teamHandle, first, last, d_first); +} + +template , int> = 0> +KOKKOS_FUNCTION auto reverse_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::reverse_copy_impl(label, ex, cbegin(source), cend(source), - begin(dest)); + return Impl::reverse_copy_team_impl(teamHandle, cbegin(source), cend(source), + begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Rotate.hpp b/algorithms/src/std_algorithms/Kokkos_Rotate.hpp index 738e9bf137..aff04b47d6 100644 --- a/algorithms/src/std_algorithms/Kokkos_Rotate.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Rotate.hpp @@ -23,36 +23,71 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType rotate(const ExecutionSpace& ex, IteratorType first, IteratorType n_first, IteratorType last) { - return Impl::rotate_impl("Kokkos::rotate_iterator_api_default", ex, first, - n_first, last); + return Impl::rotate_exespace_impl("Kokkos::rotate_iterator_api_default", ex, + first, n_first, last); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType rotate(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType n_first, IteratorType last) { - return Impl::rotate_impl(label, ex, first, n_first, last); + return Impl::rotate_exespace_impl(label, ex, first, n_first, last); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto rotate(const ExecutionSpace& ex, const ::Kokkos::View& view, std::size_t n_location) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::rotate_impl("Kokkos::rotate_view_api_default", ex, begin(view), - begin(view) + n_location, end(view)); + return Impl::rotate_exespace_impl("Kokkos::rotate_view_api_default", ex, + begin(view), begin(view) + n_location, + end(view)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto rotate(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, std::size_t n_location) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::rotate_impl(label, ex, begin(view), begin(view) + n_location, - end(view)); + return Impl::rotate_exespace_impl(label, ex, begin(view), + begin(view) + n_location, end(view)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType rotate(const TeamHandleType& teamHandle, + IteratorType first, IteratorType n_first, + IteratorType last) { + return Impl::rotate_team_impl(teamHandle, first, n_first, last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto rotate(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + std::size_t n_location) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::rotate_team_impl(teamHandle, begin(view), + begin(view) + n_location, end(view)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp b/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp index f5d826c4bb..cce37fccfa 100644 --- a/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_RotateCopy.hpp @@ -23,23 +23,34 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator rotate_copy(const ExecutionSpace& ex, InputIterator first, InputIterator n_first, InputIterator last, OutputIterator d_first) { - return Impl::rotate_copy_impl("Kokkos::rotate_copy_iterator_api_default", ex, - first, n_first, last, d_first); + return Impl::rotate_copy_exespace_impl( + "Kokkos::rotate_copy_iterator_api_default", ex, first, n_first, last, + d_first); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator rotate_copy(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator n_first, InputIterator last, OutputIterator d_first) { - return Impl::rotate_copy_impl(label, ex, first, n_first, last, d_first); + return Impl::rotate_copy_exespace_impl(label, ex, first, n_first, last, + d_first); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto rotate_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, std::size_t n_location, @@ -47,13 +58,15 @@ auto rotate_copy(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::rotate_copy_impl("Kokkos::rotate_copy_view_api_default", ex, - cbegin(source), cbegin(source) + n_location, - cend(source), begin(dest)); + return Impl::rotate_copy_exespace_impl( + "Kokkos::rotate_copy_view_api_default", ex, cbegin(source), + cbegin(source) + n_location, cend(source), begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto rotate_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, std::size_t n_location, @@ -61,9 +74,41 @@ auto rotate_copy(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::rotate_copy_impl(label, ex, cbegin(source), - cbegin(source) + n_location, cend(source), - begin(dest)); + return Impl::rotate_copy_exespace_impl(label, ex, cbegin(source), + cbegin(source) + n_location, + cend(source), begin(dest)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator rotate_copy(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator n_first, + InputIterator last, + OutputIterator d_first) { + return Impl::rotate_copy_team_impl(teamHandle, first, n_first, last, d_first); +} + +template , int> = 0> +KOKKOS_FUNCTION auto rotate_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + std::size_t n_location, + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::rotate_copy_team_impl(teamHandle, cbegin(source), + cbegin(source) + n_location, cend(source), + begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Search.hpp b/algorithms/src/std_algorithms/Kokkos_Search.hpp index b1154b297e..43258a484e 100644 --- a/algorithms/src/std_algorithms/Kokkos_Search.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Search.hpp @@ -23,24 +23,34 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1: no binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first, - last, s_first, s_last); + return Impl::search_exespace_impl("Kokkos::search_iterator_api_default", ex, + first, last, s_first, s_last); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 search(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { - return Impl::search_impl(label, ex, first, last, s_first, s_last); + return Impl::search_exespace_impl(label, ex, first, last, s_first, s_last); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto search(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -48,13 +58,15 @@ auto search(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::search_impl("Kokkos::search_view_api_default", ex, - KE::begin(view), KE::end(view), KE::begin(s_view), - KE::end(s_view)); + return Impl::search_exespace_impl("Kokkos::search_view_api_default", ex, + KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto search(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view) { @@ -62,31 +74,38 @@ auto search(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::search_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view)); + return Impl::search_exespace_impl(label, ex, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); } // overload set 2: binary predicate passed -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 search(const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::search_impl("Kokkos::search_iterator_api_default", ex, first, - last, s_first, s_last, pred); + return Impl::search_exespace_impl("Kokkos::search_iterator_api_default", ex, + first, last, s_first, s_last, pred); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType1 search(const std::string& label, const ExecutionSpace& ex, IteratorType1 first, IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, const BinaryPredicateType& pred) { - return Impl::search_impl(label, ex, first, last, s_first, s_last, pred); + return Impl::search_exespace_impl(label, ex, first, last, s_first, s_last, + pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto search(const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -95,13 +114,15 @@ auto search(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::search_impl("Kokkos::search_view_api_default", ex, - KE::begin(view), KE::end(view), KE::begin(s_view), - KE::end(s_view), pred); + return Impl::search_exespace_impl("Kokkos::search_view_api_default", ex, + KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto search(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, const ::Kokkos::View& s_view, @@ -110,8 +131,70 @@ auto search(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); namespace KE = ::Kokkos::Experimental; - return Impl::search_impl(label, ex, KE::begin(view), KE::end(view), - KE::begin(s_view), KE::end(s_view), pred); + return Impl::search_exespace_impl(label, ex, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1: no binary predicate passed +template , int> = 0> +KOKKOS_FUNCTION IteratorType1 search(const TeamHandleType& teamHandle, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last) { + return Impl::search_team_impl(teamHandle, first, last, s_first, s_last); +} + +template , int> = 0> +KOKKOS_FUNCTION auto search( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::search_team_impl(teamHandle, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view)); +} + +// overload set 2: binary predicate passed +template , int> = 0> + +KOKKOS_FUNCTION IteratorType1 search(const TeamHandleType& teamHandle, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last, + const BinaryPredicateType& pred) { + return Impl::search_team_impl(teamHandle, first, last, s_first, s_last, pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto search( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + const ::Kokkos::View& s_view, + const BinaryPredicateType& pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(s_view); + + namespace KE = ::Kokkos::Experimental; + return Impl::search_team_impl(teamHandle, KE::begin(view), KE::end(view), + KE::begin(s_view), KE::end(s_view), pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_SearchN.hpp b/algorithms/src/std_algorithms/Kokkos_SearchN.hpp index a649c8f205..0f8aa5f1c1 100644 --- a/algorithms/src/std_algorithms/Kokkos_SearchN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_SearchN.hpp @@ -23,68 +23,86 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1: no binary predicate passed -template +template < + class ExecutionSpace, class IteratorType, class SizeType, class ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType search_n(const ExecutionSpace& ex, IteratorType first, IteratorType last, SizeType count, const ValueType& value) { - return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first, - last, count, value); + return Impl::search_n_exespace_impl("Kokkos::search_n_iterator_api_default", + ex, first, last, count, value); } -template +template < + class ExecutionSpace, class IteratorType, class SizeType, class ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType search_n(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, SizeType count, const ValueType& value) { - return Impl::search_n_impl(label, ex, first, last, count, value); + return Impl::search_n_exespace_impl(label, ex, first, last, count, value); } template + class SizeType, class ValueType, + std::enable_if_t<::Kokkos::is_execution_space::value, + int> = 0> auto search_n(const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType count, const ValueType& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex, - KE::begin(view), KE::end(view), count, value); + return Impl::search_n_exespace_impl("Kokkos::search_n_view_api_default", ex, + KE::begin(view), KE::end(view), count, + value); } template + class SizeType, class ValueType, + std::enable_if_t<::Kokkos::is_execution_space::value, + int> = 0> auto search_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType count, const ValueType& value) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count, - value); + return Impl::search_n_exespace_impl(label, ex, KE::begin(view), KE::end(view), + count, value); } // overload set 2: binary predicate passed -template +template < + class ExecutionSpace, class IteratorType, class SizeType, class ValueType, + class BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType search_n(const ExecutionSpace& ex, IteratorType first, IteratorType last, SizeType count, const ValueType& value, const BinaryPredicateType& pred) { - return Impl::search_n_impl("Kokkos::search_n_iterator_api_default", ex, first, - last, count, value, pred); + return Impl::search_n_exespace_impl("Kokkos::search_n_iterator_api_default", + ex, first, last, count, value, pred); } -template +template < + class ExecutionSpace, class IteratorType, class SizeType, class ValueType, + class BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType search_n(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, SizeType count, const ValueType& value, const BinaryPredicateType& pred) { - return Impl::search_n_impl(label, ex, first, last, count, value, pred); + return Impl::search_n_exespace_impl(label, ex, first, last, count, value, + pred); } template + class SizeType, class ValueType, class BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space::value, + int> = 0> auto search_n(const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType count, const ValueType& value, @@ -92,13 +110,15 @@ auto search_n(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::search_n_impl("Kokkos::search_n_view_api_default", ex, - KE::begin(view), KE::end(view), count, value, - pred); + return Impl::search_n_exespace_impl("Kokkos::search_n_view_api_default", ex, + KE::begin(view), KE::end(view), count, + value, pred); } template + class SizeType, class ValueType, class BinaryPredicateType, + std::enable_if_t<::Kokkos::is_execution_space::value, + int> = 0> auto search_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, SizeType count, const ValueType& value, @@ -106,8 +126,65 @@ auto search_n(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); namespace KE = ::Kokkos::Experimental; - return Impl::search_n_impl(label, ex, KE::begin(view), KE::end(view), count, - value, pred); + return Impl::search_n_exespace_impl(label, ex, KE::begin(view), KE::end(view), + count, value, pred); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1: no binary predicate passed +template , int> = 0> +KOKKOS_FUNCTION IteratorType search_n(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + SizeType count, const ValueType& value) { + return Impl::search_n_team_impl(teamHandle, first, last, count, value); +} + +template < + class TeamHandleType, class DataType, class... Properties, class SizeType, + class ValueType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto search_n( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, SizeType count, + const ValueType& value) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + namespace KE = ::Kokkos::Experimental; + return Impl::search_n_team_impl(teamHandle, KE::begin(view), KE::end(view), + count, value); +} + +// overload set 2: binary predicate passed +template , int> = 0> +KOKKOS_FUNCTION IteratorType search_n(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + SizeType count, const ValueType& value, + const BinaryPredicateType& pred) { + return Impl::search_n_team_impl(teamHandle, first, last, count, value, pred); +} + +template < + class TeamHandleType, class DataType, class... Properties, class SizeType, + class ValueType, class BinaryPredicateType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION auto search_n( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, SizeType count, + const ValueType& value, const BinaryPredicateType& pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + namespace KE = ::Kokkos::Experimental; + return Impl::search_n_team_impl(teamHandle, KE::begin(view), KE::end(view), + count, value, pred); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp b/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp index 4b91a17ab8..b3e04a3b97 100644 --- a/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ShiftLeft.hpp @@ -23,36 +23,70 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType shift_left(const ExecutionSpace& ex, IteratorType first, IteratorType last, typename IteratorType::difference_type n) { - return Impl::shift_left_impl("Kokkos::shift_left_iterator_api_default", ex, - first, last, n); + return Impl::shift_left_exespace_impl( + "Kokkos::shift_left_iterator_api_default", ex, first, last, n); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType shift_left(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, typename IteratorType::difference_type n) { - return Impl::shift_left_impl(label, ex, first, last, n); + return Impl::shift_left_exespace_impl(label, ex, first, last, n); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto shift_left(const ExecutionSpace& ex, const ::Kokkos::View& view, typename decltype(begin(view))::difference_type n) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::shift_left_impl("Kokkos::shift_left_view_api_default", ex, - begin(view), end(view), n); + return Impl::shift_left_exespace_impl("Kokkos::shift_left_view_api_default", + ex, begin(view), end(view), n); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto shift_left(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, typename decltype(begin(view))::difference_type n) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::shift_left_impl(label, ex, begin(view), end(view), n); + return Impl::shift_left_exespace_impl(label, ex, begin(view), end(view), n); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType +shift_left(const TeamHandleType& teamHandle, IteratorType first, + IteratorType last, typename IteratorType::difference_type n) { + return Impl::shift_left_team_impl(teamHandle, first, last, n); +} + +template , int> = 0> +KOKKOS_FUNCTION auto shift_left( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + typename decltype(begin(view))::difference_type n) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::shift_left_team_impl(teamHandle, begin(view), end(view), n); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp b/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp index 2ea50fd74e..0f7ed53948 100644 --- a/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ShiftRight.hpp @@ -23,36 +23,70 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType shift_right(const ExecutionSpace& ex, IteratorType first, IteratorType last, typename IteratorType::difference_type n) { - return Impl::shift_right_impl("Kokkos::shift_right_iterator_api_default", ex, - first, last, n); + return Impl::shift_right_exespace_impl( + "Kokkos::shift_right_iterator_api_default", ex, first, last, n); } -template +template < + typename ExecutionSpace, typename IteratorType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType shift_right(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, typename IteratorType::difference_type n) { - return Impl::shift_right_impl(label, ex, first, last, n); + return Impl::shift_right_exespace_impl(label, ex, first, last, n); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto shift_right(const ExecutionSpace& ex, const ::Kokkos::View& view, typename decltype(begin(view))::difference_type n) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::shift_right_impl("Kokkos::shift_right_view_api_default", ex, - begin(view), end(view), n); + return Impl::shift_right_exespace_impl("Kokkos::shift_right_view_api_default", + ex, begin(view), end(view), n); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto shift_right(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, typename decltype(begin(view))::difference_type n) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::shift_right_impl(label, ex, begin(view), end(view), n); + return Impl::shift_right_exespace_impl(label, ex, begin(view), end(view), n); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType +shift_right(const TeamHandleType& teamHandle, IteratorType first, + IteratorType last, typename IteratorType::difference_type n) { + return Impl::shift_right_team_impl(teamHandle, first, last, n); +} + +template , int> = 0> +KOKKOS_FUNCTION auto shift_right( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + typename decltype(begin(view))::difference_type n) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + return Impl::shift_right_team_impl(teamHandle, begin(view), end(view), n); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 5fbf045318..d66763d304 100644 --- a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -23,44 +23,84 @@ namespace Kokkos { namespace Experimental { -template +// +// overload set accepting execution space +// +template , int> = 0> IteratorType2 swap_ranges(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2) { - return Impl::swap_ranges_impl("Kokkos::swap_ranges_iterator_api_default", ex, - first1, last1, first2); + return Impl::swap_ranges_exespace_impl( + "Kokkos::swap_ranges_iterator_api_default", ex, first1, last1, first2); } -template +template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); assert(source.extent(0) == dest.extent(0)); - return Impl::swap_ranges_impl("Kokkos::swap_ranges_view_api_default", ex, - begin(source), end(source), begin(dest)); + return Impl::swap_ranges_exespace_impl("Kokkos::swap_ranges_view_api_default", + ex, begin(source), end(source), + begin(dest)); } -template +template , int> = 0> IteratorType2 swap_ranges(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2) { - return Impl::swap_ranges_impl(label, ex, first1, last1, first2); + return Impl::swap_ranges_exespace_impl(label, ex, first1, last1, first2); } -template +template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + assert(source.extent(0) == dest.extent(0)); + return Impl::swap_ranges_exespace_impl(label, ex, begin(source), end(source), + begin(dest)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION IteratorType2 swap_ranges(const TeamHandleType& teamHandle, + IteratorType1 first1, + IteratorType1 last1, + IteratorType2 first2) { + return Impl::swap_ranges_team_impl(teamHandle, first1, last1, first2); +} + +template , int> = 0> +KOKKOS_FUNCTION auto swap_ranges( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); assert(source.extent(0) == dest.extent(0)); - return Impl::swap_ranges_impl(label, ex, begin(source), end(source), - begin(dest)); + return Impl::swap_ranges_team_impl(teamHandle, begin(source), end(source), + begin(dest)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 27dee30426..84cbed524d 100644 --- a/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -23,113 +23,200 @@ namespace Kokkos { namespace Experimental { -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - InputIterator, OutputIterator>::value, - OutputIterator> -transform(const ExecutionSpace& ex, InputIterator first1, InputIterator last1, - OutputIterator d_first, UnaryOperation unary_op) { - return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex, - first1, last1, d_first, std::move(unary_op)); +// +// overload set accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename UnaryOperation, + std::enable_if_t && + is_execution_space_v, + int> = 0> +OutputIterator transform(const ExecutionSpace& ex, InputIterator first1, + InputIterator last1, OutputIterator d_first, + UnaryOperation unary_op) { + return Impl::transform_exespace_impl("Kokkos::transform_iterator_api_default", + ex, first1, last1, d_first, + std::move(unary_op)); } -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - InputIterator, OutputIterator>::value, - OutputIterator> -transform(const std::string& label, const ExecutionSpace& ex, - InputIterator first1, InputIterator last1, OutputIterator d_first, - UnaryOperation unary_op) { - return Impl::transform_impl(label, ex, first1, last1, d_first, - std::move(unary_op)); +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename UnaryOperation, + std::enable_if_t && + is_execution_space_v, + int> = 0> +OutputIterator transform(const std::string& label, const ExecutionSpace& ex, + InputIterator first1, InputIterator last1, + OutputIterator d_first, UnaryOperation unary_op) { + return Impl::transform_exespace_impl(label, ex, first1, last1, d_first, + std::move(unary_op)); } -template +template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::transform_impl("Kokkos::transform_view_api_default", ex, - begin(source), end(source), begin(dest), - std::move(unary_op)); + return Impl::transform_exespace_impl("Kokkos::transform_view_api_default", ex, + begin(source), end(source), begin(dest), + std::move(unary_op)); } -template +template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::transform_impl(label, ex, begin(source), end(source), - begin(dest), std::move(unary_op)); + return Impl::transform_exespace_impl(label, ex, begin(source), end(source), + begin(dest), std::move(unary_op)); } -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - InputIterator1, InputIterator2, OutputIterator>::value, - OutputIterator> -transform(const ExecutionSpace& ex, InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputIterator d_first, - BinaryOperation binary_op) { - return Impl::transform_impl("Kokkos::transform_iterator_api_default", ex, - first1, last1, first2, d_first, - std::move(binary_op)); +template < + typename ExecutionSpace, typename InputIterator1, typename InputIterator2, + typename OutputIterator, typename BinaryOperation, + std::enable_if_t< + Impl::are_iterators_v && + is_execution_space_v, + int> = 0> +OutputIterator transform(const ExecutionSpace& ex, InputIterator1 first1, + InputIterator1 last1, InputIterator2 first2, + OutputIterator d_first, BinaryOperation binary_op) { + return Impl::transform_exespace_impl("Kokkos::transform_iterator_api_default", + ex, first1, last1, first2, d_first, + std::move(binary_op)); } -template -std::enable_if_t< ::Kokkos::Experimental::Impl::are_iterators< - InputIterator1, InputIterator2, OutputIterator>::value, - OutputIterator> -transform(const std::string& label, const ExecutionSpace& ex, - InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, - OutputIterator d_first, BinaryOperation binary_op) { - return Impl::transform_impl(label, ex, first1, last1, first2, d_first, - std::move(binary_op)); +template < + typename ExecutionSpace, typename InputIterator1, typename InputIterator2, + typename OutputIterator, typename BinaryOperation, + std::enable_if_t< + Impl::are_iterators_v && + is_execution_space_v, + int> = 0> +OutputIterator transform(const std::string& label, const ExecutionSpace& ex, + InputIterator1 first1, InputIterator1 last1, + InputIterator2 first2, OutputIterator d_first, + BinaryOperation binary_op) { + return Impl::transform_exespace_impl(label, ex, first1, last1, first2, + d_first, std::move(binary_op)); } -template +template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::transform_impl("Kokkos::transform_view_api_default", ex, - begin(source1), end(source1), begin(source2), - begin(dest), std::move(binary_op)); + return Impl::transform_exespace_impl( + "Kokkos::transform_view_api_default", ex, begin(source1), end(source1), + begin(source2), begin(dest), std::move(binary_op)); } -template +template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::transform_impl(label, ex, begin(source1), end(source1), - begin(source2), begin(dest), - std::move(binary_op)); + return Impl::transform_exespace_impl(label, ex, begin(source1), end(source1), + begin(source2), begin(dest), + std::move(binary_op)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template < + typename TeamHandleType, typename InputIterator, typename OutputIterator, + typename UnaryOperation, + std::enable_if_t && + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIterator transform(const TeamHandleType& teamHandle, + InputIterator first1, + InputIterator last1, + OutputIterator d_first, + UnaryOperation unary_op) { + return Impl::transform_team_impl(teamHandle, first1, last1, d_first, + std::move(unary_op)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest, + UnaryOperation unary_op) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::transform_team_impl(teamHandle, begin(source), end(source), + begin(dest), std::move(unary_op)); +} + +template < + typename TeamHandleType, typename InputIterator1, typename InputIterator2, + typename OutputIterator, typename BinaryOperation, + std::enable_if_t< + Impl::are_iterators_v && + is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIterator transform(const TeamHandleType& teamHandle, + InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator d_first, + BinaryOperation binary_op) { + return Impl::transform_team_impl(teamHandle, first1, last1, first2, d_first, + std::move(binary_op)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source1, + const ::Kokkos::View& source2, + const ::Kokkos::View& dest, + BinaryOperation binary_op) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::transform_team_impl(teamHandle, begin(source1), end(source1), + begin(source2), begin(dest), + std::move(binary_op)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp index 9d85aee06f..37fc0f860e 100644 --- a/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp @@ -23,44 +23,52 @@ namespace Kokkos { namespace Experimental { -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_exclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - ValueType init_value, BinaryOpType binary_op, - UnaryOpType unary_op) { +// +// overload set accepting execution space +// +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_exclusive_scan( + const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, + OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op, + UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( "Kokkos::transform_exclusive_scan_custom_functors_iterator_api", ex, - first, last, first_dest, init_value, binary_op, unary_op); + first, last, first_dest, std::move(init_value), binary_op, unary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_exclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, ValueType init_value, - BinaryOpType binary_op, UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_exclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, + BinaryOpType binary_op, UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::transform_exclusive_scan_impl(label, ex, first, last, first_dest, - init_value, binary_op, unary_op); + return Impl::transform_exclusive_scan_exespace_impl( + label, ex, first, last, first_dest, std::move(init_value), binary_op, + unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryOpType, typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_exclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -69,18 +77,20 @@ auto transform_exclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( "Kokkos::transform_exclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value, binary_op, unary_op); + std::move(init_value), binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryOpType, typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_exclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -89,12 +99,56 @@ auto transform_exclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::transform_exclusive_scan_impl( + return Impl::transform_exclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), init_value, binary_op, unary_op); + KE::begin(view_dest), std::move(init_value), binary_op, unary_op); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template && :: + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + return Impl::transform_exclusive_scan_team_impl( + teamHandle, first, last, first_dest, std::move(init_value), binary_op, + unary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_exclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; + return Impl::transform_exclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), std::move(init_value), binary_op, unary_op); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp b/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp index 7489af7e37..5f694dbfd9 100644 --- a/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp +++ b/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp @@ -23,40 +23,53 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // overload set 1 (no init value) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOpType binary_op, UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan(const ExecutionSpace& ex, + InputIteratorType first, + InputIteratorType last, + OutputIteratorType first_dest, + BinaryOpType binary_op, + UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, binary_op, unary_op); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOpType binary_op, - UnaryOpType unary_op) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest, - binary_op, unary_op); + return Impl::transform_inclusive_scan_exespace_impl( + label, ex, first, last, first_dest, binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOpType, + typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -66,15 +79,17 @@ auto transform_inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op, unary_op); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOpType, + typename UnaryOpType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -84,46 +99,59 @@ auto transform_inclusive_scan( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), binary_op, unary_op); } // overload set 2 (init value) -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const ExecutionSpace& ex, InputIteratorType first, - InputIteratorType last, OutputIteratorType first_dest, - BinaryOpType binary_op, UnaryOpType unary_op, - ValueType init_value) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, + OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, + ValueType init_value) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl( + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex, - first, last, first_dest, binary_op, unary_op, init_value); + first, last, first_dest, binary_op, unary_op, std::move(init_value)); } -template -std::enable_if_t<::Kokkos::Experimental::Impl::are_iterators< - InputIteratorType, OutputIteratorType>::value, - OutputIteratorType> -transform_inclusive_scan(const std::string& label, const ExecutionSpace& ex, - InputIteratorType first, InputIteratorType last, - OutputIteratorType first_dest, BinaryOpType binary_op, - UnaryOpType unary_op, ValueType init_value) { +template && :: + Kokkos::is_execution_space_v, + int> = 0> +OutputIteratorType transform_inclusive_scan( + const std::string& label, const ExecutionSpace& ex, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_impl(label, ex, first, last, first_dest, - binary_op, unary_op, init_value); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_exespace_impl( + label, ex, first, last, first_dest, binary_op, unary_op, + std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOpType, + typename UnaryOpType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -132,16 +160,21 @@ auto transform_inclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - binary_op, unary_op, init_value); + binary_op, unary_op, std::move(init_value)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryOpType, + typename UnaryOpType, typename ValueType, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, @@ -150,10 +183,97 @@ auto transform_inclusive_scan( Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + namespace KE = ::Kokkos::Experimental; - return Impl::transform_inclusive_scan_impl( + return Impl::transform_inclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), binary_op, unary_op, init_value); + KE::begin(view_dest), binary_op, unary_op, std::move(init_value)); +} + +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// overload set 1 (no init value) +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + + return Impl::transform_inclusive_scan_team_impl( + teamHandle, first, last, first_dest, binary_op, unary_op); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOpType binary_op, UnaryOpType unary_op) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + namespace KE = ::Kokkos::Experimental; + return Impl::transform_inclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, unary_op); +} + +// overload set 2 (init value) +template && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( + const TeamHandleType& teamHandle, InputIteratorType first, + InputIteratorType last, OutputIteratorType first_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { + Impl::static_assert_is_not_openmptarget(teamHandle); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + return Impl::transform_inclusive_scan_team_impl( + teamHandle, first, last, first_dest, binary_op, unary_op, + std::move(init_value)); +} + +template , int> = 0> +KOKKOS_FUNCTION auto transform_inclusive_scan( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view_from, + const ::Kokkos::View& view_dest, + BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { + Impl::static_assert_is_not_openmptarget(teamHandle); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); + static_assert(std::is_move_constructible_v, + "ValueType must be move constructible."); + + namespace KE = ::Kokkos::Experimental; + return Impl::transform_inclusive_scan_team_impl( + teamHandle, KE::cbegin(view_from), KE::cend(view_from), + KE::begin(view_dest), binary_op, unary_op, std::move(init_value)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index b5ec9066d2..101f5113f6 100644 --- a/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -23,34 +23,44 @@ namespace Kokkos { namespace Experimental { +// +// overload set accepting execution space +// + // ---------------------------- // overload set1: // no custom functors passed, so equivalent to // transform_reduce(first1, last1, first2, init, plus<>(), multiplies<>()); // ---------------------------- -template +template ::value, + int> = 0> ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) { - return Impl::transform_reduce_default_functors_impl( + return Impl::transform_reduce_default_functors_exespace_impl( "Kokkos::transform_reduce_default_functors_iterator_api", ex, first1, last1, first2, std::move(init_reduction_value)); } -template +template ::value, + int> = 0> ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) { - return Impl::transform_reduce_default_functors_impl( + return Impl::transform_reduce_default_functors_exespace_impl( label, ex, first1, last1, first2, std::move(init_reduction_value)); } // overload1 accepting views -template +template ::value, + int> = 0> ValueType transform_reduce( const ExecutionSpace& ex, const ::Kokkos::View& first_view, @@ -60,14 +70,16 @@ ValueType transform_reduce( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); - return Impl::transform_reduce_default_functors_impl( + return Impl::transform_reduce_default_functors_exespace_impl( "Kokkos::transform_reduce_default_functors_iterator_api", ex, KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view), std::move(init_reduction_value)); } -template +template ::value, + int> = 0> ValueType transform_reduce( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& first_view, @@ -77,7 +89,7 @@ ValueType transform_reduce( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); - return Impl::transform_reduce_default_functors_impl( + return Impl::transform_reduce_default_functors_exespace_impl( label, ex, KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view), std::move(init_reduction_value)); } @@ -95,8 +107,11 @@ ValueType transform_reduce( // https://en.cppreference.com/w/cpp/algorithm/transform_reduce // api accepting iterators -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename ValueType, typename BinaryJoinerType, typename BinaryTransform, + std::enable_if_t<::Kokkos::is_execution_space::value, int> = + 0> ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, @@ -105,14 +120,17 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( "Kokkos::transform_reduce_custom_functors_iterator_api", ex, first1, last1, first2, std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } -template +template < + typename ExecutionSpace, typename IteratorType1, typename IteratorType2, + typename ValueType, typename BinaryJoinerType, typename BinaryTransform, + std::enable_if_t<::Kokkos::is_execution_space::value, int> = + 0> ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, @@ -121,15 +139,17 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( label, ex, first1, last1, first2, std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } // accepting views -template +template ::value, + int> = 0> ValueType transform_reduce( const ExecutionSpace& ex, const ::Kokkos::View& first_view, @@ -143,16 +163,18 @@ ValueType transform_reduce( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( "Kokkos::transform_reduce_custom_functors_view_api", ex, KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view), std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } -template +template ::value, + int> = 0> ValueType transform_reduce( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& first_view, @@ -166,7 +188,7 @@ ValueType transform_reduce( Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( label, ex, KE::cbegin(first_view), KE::cend(first_view), KE::cbegin(second_view), std::move(init_reduction_value), std::move(joiner), std::move(transformer)); @@ -176,43 +198,50 @@ ValueType transform_reduce( // overload set3: // // accepting iterators -template -// need this to avoid ambiguous call -std::enable_if_t< - ::Kokkos::Experimental::Impl::are_iterators::value, ValueType> -transform_reduce(const ExecutionSpace& ex, IteratorType first1, - IteratorType last1, ValueType init_reduction_value, - BinaryJoinerType joiner, UnaryTransform transformer) { +template ::value && + is_execution_space::value, + int> = 0> +ValueType transform_reduce(const ExecutionSpace& ex, IteratorType first1, + IteratorType last1, ValueType init_reduction_value, + BinaryJoinerType joiner, + UnaryTransform transformer) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( "Kokkos::transform_reduce_custom_functors_iterator_api", ex, first1, last1, std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } -template -// need this to avoid ambiguous call -std::enable_if_t< - ::Kokkos::Experimental::Impl::are_iterators::value, ValueType> -transform_reduce(const std::string& label, const ExecutionSpace& ex, - IteratorType first1, IteratorType last1, - ValueType init_reduction_value, BinaryJoinerType joiner, - UnaryTransform transformer) { +template ::value && + is_execution_space::value, + int> = 0> +ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, + IteratorType first1, IteratorType last1, + ValueType init_reduction_value, + BinaryJoinerType joiner, + UnaryTransform transformer) { static_assert(std::is_move_constructible::value, "ValueType must be move constructible."); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( label, ex, first1, last1, std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } // accepting views -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ValueType, typename BinaryJoinerType, typename UnaryTransform, + std::enable_if_t<::Kokkos::is_execution_space::value, int> = + 0> ValueType transform_reduce(const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value, @@ -224,14 +253,17 @@ ValueType transform_reduce(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( "Kokkos::transform_reduce_custom_functors_view_api", ex, KE::cbegin(view), KE::cend(view), std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } -template +template < + typename ExecutionSpace, typename DataType, typename... Properties, + typename ValueType, typename BinaryJoinerType, typename UnaryTransform, + std::enable_if_t<::Kokkos::is_execution_space::value, int> = + 0> ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ValueType init_reduction_value, @@ -243,12 +275,154 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::transform_reduce_custom_functors_impl( + return Impl::transform_reduce_custom_functors_exespace_impl( label, ex, KE::cbegin(view), KE::cend(view), std::move(init_reduction_value), std::move(joiner), std::move(transformer)); } +// +// overload set accepting a team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// + +// ---------------------------- +// overload set1: +// no custom functors passed, so equivalent to +// transform_reduce(first1, last1, first2, init, plus<>(), multiplies<>()); +// ---------------------------- +template < + typename TeamHandleType, typename IteratorType1, typename IteratorType2, + typename ValueType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, + IteratorType1 first1, + IteratorType1 last1, + IteratorType2 first2, + ValueType init_reduction_value) { + return Impl::transform_reduce_default_functors_team_impl( + teamHandle, first1, last1, first2, std::move(init_reduction_value)); +} + +// overload1 accepting views +template < + typename TeamHandleType, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType +transform_reduce(const TeamHandleType& teamHandle, + const ::Kokkos::View& first_view, + const ::Kokkos::View& second_view, + ValueType init_reduction_value) { + namespace KE = ::Kokkos::Experimental; + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); + + return Impl::transform_reduce_default_functors_team_impl( + teamHandle, KE::cbegin(first_view), KE::cend(first_view), + KE::cbegin(second_view), std::move(init_reduction_value)); +} + +// +// overload set2: +// accepts a custom transform and joiner functor +// + +// Note the std refers to the arg BinaryReductionOp +// but in the Kokkos naming convention, it corresponds +// to a "joiner" that knows how to join two values +// NOTE: "joiner/transformer" need to be commutative. + +// https://en.cppreference.com/w/cpp/algorithm/transform_reduce + +// api accepting iterators +template < + typename TeamHandleType, typename IteratorType1, typename IteratorType2, + typename ValueType, typename BinaryJoinerType, typename BinaryTransform, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType transform_reduce( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, ValueType init_reduction_value, + BinaryJoinerType joiner, BinaryTransform transformer) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + return Impl::transform_reduce_custom_functors_team_impl( + teamHandle, first1, last1, first2, std::move(init_reduction_value), + std::move(joiner), std::move(transformer)); +} + +// accepting views +template < + typename TeamHandleType, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename ValueType, + typename BinaryJoinerType, typename BinaryTransform, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType +transform_reduce(const TeamHandleType& teamHandle, + const ::Kokkos::View& first_view, + const ::Kokkos::View& second_view, + ValueType init_reduction_value, BinaryJoinerType joiner, + BinaryTransform transformer) { + namespace KE = ::Kokkos::Experimental; + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(second_view); + + return Impl::transform_reduce_custom_functors_team_impl( + teamHandle, KE::cbegin(first_view), KE::cend(first_view), + KE::cbegin(second_view), std::move(init_reduction_value), + std::move(joiner), std::move(transformer)); +} + +// +// overload set3: +// +// accepting iterators +template ::value && + is_team_handle::value, + int> = 0> +KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, + IteratorType first1, + IteratorType last1, + ValueType init_reduction_value, + BinaryJoinerType joiner, + UnaryTransform transformer) { + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + return Impl::transform_reduce_custom_functors_team_impl( + teamHandle, first1, last1, std::move(init_reduction_value), + std::move(joiner), std::move(transformer)); +} + +// accepting views +template < + typename TeamHandleType, typename DataType, typename... Properties, + typename ValueType, typename BinaryJoinerType, typename UnaryTransform, + std::enable_if_t<::Kokkos::is_team_handle::value, int> = 0> +KOKKOS_FUNCTION ValueType +transform_reduce(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + ValueType init_reduction_value, BinaryJoinerType joiner, + UnaryTransform transformer) { + namespace KE = ::Kokkos::Experimental; + static_assert(std::is_move_constructible::value, + "ValueType must be move constructible."); + + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); + + return Impl::transform_reduce_custom_functors_team_impl( + teamHandle, KE::cbegin(view), KE::cend(view), + std::move(init_reduction_value), std::move(joiner), + std::move(transformer)); +} + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_Unique.hpp b/algorithms/src/std_algorithms/Kokkos_Unique.hpp index b47ecffb20..2d56315f61 100644 --- a/algorithms/src/std_algorithms/Kokkos_Unique.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Unique.hpp @@ -23,71 +23,132 @@ namespace Kokkos { namespace Experimental { -// note: the enable_if below is to avoid "call to ... is ambiguous" -// for example in the unit test when using a variadic function - -// overload set1 -template -std::enable_if_t::value, IteratorType> unique( - const ExecutionSpace& ex, IteratorType first, IteratorType last) { - return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first, - last); +// +// overload set1: default predicate, accepting execution space +// +template && + is_execution_space::value, + int> = 0> +IteratorType unique(const ExecutionSpace& ex, IteratorType first, + IteratorType last) { + return Impl::unique_exespace_impl("Kokkos::unique_iterator_api_default", ex, + first, last); } -template -std::enable_if_t::value, IteratorType> unique( - const std::string& label, const ExecutionSpace& ex, IteratorType first, - IteratorType last) { - return Impl::unique_impl(label, ex, first, last); +template && + is_execution_space::value, + int> = 0> +IteratorType unique(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last) { + return Impl::unique_exespace_impl(label, ex, first, last); } -template +template ::value, int> = 0> auto unique(const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return ::Kokkos::Experimental::unique("Kokkos::unique_view_api_default", ex, - begin(view), end(view)); + return Impl::unique_exespace_impl("Kokkos::unique_view_api_default", ex, + begin(view), end(view)); } -template +template ::value, int> = 0> auto unique(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return ::Kokkos::Experimental::unique(label, ex, begin(view), end(view)); + return Impl::unique_exespace_impl(label, ex, begin(view), end(view)); } -// overload set2 -template +// +// overload set2: custom predicate, accepting execution space +// +template ::value, int> = 0> IteratorType unique(const ExecutionSpace& ex, IteratorType first, IteratorType last, BinaryPredicate pred) { - return Impl::unique_impl("Kokkos::unique_iterator_api_default", ex, first, - last, pred); + return Impl::unique_exespace_impl("Kokkos::unique_iterator_api_default", ex, + first, last, pred); } -template +template ::value, int> = 0> IteratorType unique(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, BinaryPredicate pred) { - return Impl::unique_impl(label, ex, first, last, pred); + return Impl::unique_exespace_impl(label, ex, first, last, pred); } -template +template ::value, int> = 0> auto unique(const ExecutionSpace& ex, const ::Kokkos::View& view, BinaryPredicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::unique_impl("Kokkos::unique_view_api_default", ex, begin(view), - end(view), std::move(pred)); + return Impl::unique_exespace_impl("Kokkos::unique_view_api_default", ex, + begin(view), end(view), std::move(pred)); } -template +template ::value, int> = 0> auto unique(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, BinaryPredicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - return Impl::unique_impl(label, ex, begin(view), end(view), std::move(pred)); + return Impl::unique_exespace_impl(label, ex, begin(view), end(view), + std::move(pred)); +} + +// +// overload set3: default predicate, accepting team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template && + is_team_handle::value, + int> = 0> +KOKKOS_FUNCTION IteratorType unique(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last) { + return Impl::unique_team_impl(teamHandle, first, last); +} + +template ::value, int> = 0> +KOKKOS_FUNCTION auto unique( + const TeamHandleType& teamHandle, + const ::Kokkos::View& view) { + return Impl::unique_team_impl(teamHandle, begin(view), end(view)); +} + +// +// overload set4: custom predicate, accepting team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template ::value, int> = 0> +KOKKOS_FUNCTION IteratorType unique(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + BinaryPredicate pred) { + return Impl::unique_team_impl(teamHandle, first, last, std::move(pred)); +} + +template ::value, int> = 0> +KOKKOS_FUNCTION auto unique(const TeamHandleType& teamHandle, + const ::Kokkos::View& view, + BinaryPredicate pred) { + return Impl::unique_team_impl(teamHandle, begin(view), end(view), + std::move(pred)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp b/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp index bd2451c220..4a32d7e095 100644 --- a/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_UniqueCopy.hpp @@ -23,67 +23,90 @@ namespace Kokkos { namespace Experimental { -// overload set1 -template -std::enable_if_t::value, OutputIterator> -unique_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last, - OutputIterator d_first) { - return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex, - first, last, d_first); +// +// overload set1: default predicate, accepting execution space +// +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t && + is_execution_space_v, + int> = 0> +OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first, + InputIterator last, OutputIterator d_first) { + return Impl::unique_copy_exespace_impl( + "Kokkos::unique_copy_iterator_api_default", ex, first, last, d_first); } -template -std::enable_if_t::value, OutputIterator> -unique_copy(const std::string& label, const ExecutionSpace& ex, - InputIterator first, InputIterator last, OutputIterator d_first) { - return Impl::unique_copy_impl(label, ex, first, last, d_first); +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + std::enable_if_t && + is_execution_space_v, + int> = 0> +OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex, + InputIterator first, InputIterator last, + OutputIterator d_first) { + return Impl::unique_copy_exespace_impl(label, ex, first, last, d_first); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto unique_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return ::Kokkos::Experimental::unique_copy( - "Kokkos::unique_copy_view_api_default", ex, cbegin(source), cend(source), - begin(dest)); + return Impl::unique_copy_exespace_impl("Kokkos::unique_copy_view_api_default", + ex, cbegin(source), cend(source), + begin(dest)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto unique_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return ::Kokkos::Experimental::unique_copy(label, ex, cbegin(source), - cend(source), begin(dest)); + return Impl::unique_copy_exespace_impl(label, ex, cbegin(source), + cend(source), begin(dest)); } -// overload set2 -template +// +// overload set2: custom predicate, accepting execution space +// + +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename BinaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator unique_copy(const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first, BinaryPredicate pred) { - return Impl::unique_copy_impl("Kokkos::unique_copy_iterator_api_default", ex, - first, last, d_first, pred); + return Impl::unique_copy_exespace_impl( + "Kokkos::unique_copy_iterator_api_default", ex, first, last, d_first, + pred); } -template +template < + typename ExecutionSpace, typename InputIterator, typename OutputIterator, + typename BinaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> OutputIterator unique_copy(const std::string& label, const ExecutionSpace& ex, InputIterator first, InputIterator last, OutputIterator d_first, BinaryPredicate pred) { - return Impl::unique_copy_impl(label, ex, first, last, d_first, pred); + return Impl::unique_copy_exespace_impl(label, ex, first, last, d_first, pred); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto unique_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, const ::Kokkos::View& dest, @@ -91,13 +114,15 @@ auto unique_copy(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::unique_copy_impl("Kokkos::unique_copy_view_api_default", ex, - cbegin(source), cend(source), begin(dest), - std::move(pred)); + return Impl::unique_copy_exespace_impl("Kokkos::unique_copy_view_api_default", + ex, cbegin(source), cend(source), + begin(dest), std::move(pred)); } -template +template < + typename ExecutionSpace, typename DataType1, typename... Properties1, + typename DataType2, typename... Properties2, typename BinaryPredicate, + std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto unique_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, const ::Kokkos::View& dest, @@ -105,8 +130,70 @@ auto unique_copy(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); - return Impl::unique_copy_impl(label, ex, cbegin(source), cend(source), - begin(dest), std::move(pred)); + return Impl::unique_copy_exespace_impl( + label, ex, cbegin(source), cend(source), begin(dest), std::move(pred)); +} + +// +// overload set3: default predicate, accepting team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template < + typename TeamHandleType, typename InputIterator, typename OutputIterator, + std::enable_if_t && + Kokkos::is_team_handle_v, + int> = 0> +KOKKOS_FUNCTION OutputIterator unique_copy(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator last, + OutputIterator d_first) { + return Impl::unique_copy_team_impl(teamHandle, first, last, d_first); +} + +template , int> = 0> +KOKKOS_FUNCTION auto unique_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::unique_copy_team_impl(teamHandle, cbegin(source), cend(source), + begin(dest)); +} + +// +// overload set4: custom predicate, accepting team handle +// Note: for now omit the overloads accepting a label +// since they cause issues on device because of the string allocation. +// +template , int> = 0> +KOKKOS_FUNCTION OutputIterator unique_copy(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator last, + OutputIterator d_first, + BinaryPredicate pred) { + return Impl::unique_copy_team_impl(teamHandle, first, last, d_first, pred); +} + +template , int> = 0> +KOKKOS_FUNCTION auto unique_copy( + const TeamHandleType& teamHandle, + const ::Kokkos::View& source, + const ::Kokkos::View& dest, + BinaryPredicate pred) { + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); + Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); + + return Impl::unique_copy_team_impl(teamHandle, cbegin(source), cend(source), + begin(dest), std::move(pred)); } } // namespace Experimental diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index 8a474508d7..9f7fcf94fe 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -63,14 +63,15 @@ struct StdAdjacentDiffFunctor { m_op(std::move(op)) {} }; +// +// exespace impl +// template -OutputIteratorType adjacent_difference_impl(const std::string& label, - const ExecutionSpace& ex, - InputIteratorType first_from, - InputIteratorType last_from, - OutputIteratorType first_dest, - BinaryOp bin_op) { +OutputIteratorType adjacent_difference_exespace_impl( + const std::string& label, const ExecutionSpace& ex, + InputIteratorType first_from, InputIteratorType last_from, + OutputIteratorType first_dest, BinaryOp bin_op) { // checks Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); Impl::static_assert_iterators_have_matching_difference_type(first_from, @@ -81,25 +82,60 @@ OutputIteratorType adjacent_difference_impl(const std::string& label, return first_dest; } - // aliases - using value_type = typename OutputIteratorType::value_type; - using aux_view_type = ::Kokkos::View; - using functor_t = - StdAdjacentDiffFunctor; +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); - aux_view_type aux_view("aux_view", num_elements); - ::Kokkos::parallel_for(label, - RangePolicy(ex, 0, num_elements), - functor_t(first_from, first_dest, bin_op)); + ::Kokkos::parallel_for( + label, RangePolicy(ex, 0, num_elements), + StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); ex.fence("Kokkos::adjacent_difference: fence after operation"); // return return first_dest + num_elements; } +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOp bin_op) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + if (first_from == last_from) { + return first_dest; + } + +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_for( + TeamThreadRange(teamHandle, 0, num_elements), + StdAdjacentDiffFunctor(first_from, first_dest, bin_op)); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp index dd785e603b..f30b7be06a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AdjacentFind.hpp @@ -27,9 +27,9 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdAdjacentFindFunctor { + using index_type = typename IteratorType::difference_type; using red_value_type = typename ReducerType::value_type; IteratorType m_first; @@ -37,13 +37,13 @@ struct StdAdjacentFindFunctor { PredicateType m_p; KOKKOS_FUNCTION - void operator()(const IndexType i, red_value_type& red_value) const { + void operator()(const index_type i, red_value_type& red_value) const { const auto& my_value = m_first[i]; const auto& next_value = m_first[i + 1]; const bool are_equal = m_p(my_value, next_value); // FIXME_NVHPC using a ternary operator causes problems - red_value_type value = {::Kokkos::reduction_identity::min()}; + red_value_type value = {::Kokkos::reduction_identity::min()}; if (are_equal) { value.min_loc_true = i; } @@ -59,10 +59,14 @@ struct StdAdjacentFindFunctor { m_p(std::move(p)) {} }; +// +// exespace impl +// template -IteratorType adjacent_find_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last, PredicateType pred) { +IteratorType adjacent_find_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, IteratorType last, + PredicateType pred) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); @@ -76,8 +80,6 @@ IteratorType adjacent_find_impl(const std::string& label, using index_type = typename IteratorType::difference_type; using reducer_type = FirstLoc; using reduction_value_type = typename reducer_type::value_type; - using func_t = StdAdjacentFindFunctor; reduction_value_type red_result; reducer_type reducer(red_result); @@ -86,7 +88,8 @@ IteratorType adjacent_find_impl(const std::string& label, // each index i in the reduction checks i and (i+1). ::Kokkos::parallel_reduce( label, RangePolicy(ex, 0, num_elements - 1), - func_t(first, reducer, pred), reducer); + // use CTAD + StdAdjacentFindFunctor(first, reducer, pred), reducer); // fence not needed because reducing into scalar if (red_result.min_loc_true == @@ -98,12 +101,62 @@ IteratorType adjacent_find_impl(const std::string& label, } template -IteratorType adjacent_find_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last) { +IteratorType adjacent_find_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, + IteratorType last) { + using value_type = typename IteratorType::value_type; + using default_pred_t = StdAlgoEqualBinaryPredicate; + return adjacent_find_exespace_impl(label, ex, first, last, default_pred_t()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION IteratorType +adjacent_find_team_impl(const TeamHandleType& teamHandle, IteratorType first, + IteratorType last, PredicateType pred) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + const auto num_elements = Kokkos::Experimental::distance(first, last); + + if (num_elements <= 1) { + return last; + } + + using index_type = typename IteratorType::difference_type; + using reducer_type = FirstLoc; + using reduction_value_type = typename reducer_type::value_type; + + reduction_value_type red_result; + reducer_type reducer(red_result); + + // note that we use below num_elements-1 because + // each index i in the reduction checks i and (i+1). + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements - 1), + // use CTAD + StdAdjacentFindFunctor(first, reducer, pred), + reducer); + + teamHandle.team_barrier(); + + if (red_result.min_loc_true == + ::Kokkos::reduction_identity::min()) { + return last; + } else { + return first + red_result.min_loc_true; + } +} + +template +KOKKOS_FUNCTION IteratorType adjacent_find_team_impl( + const TeamHandleType& teamHandle, IteratorType first, IteratorType last) { using value_type = typename IteratorType::value_type; using default_pred_t = StdAlgoEqualBinaryPredicate; - return adjacent_find_impl(label, ex, first, last, default_pred_t()); + return adjacent_find_team_impl(teamHandle, first, last, default_pred_t()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp index ad562070a0..bdc050f9c1 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_AllOfAnyOfNoneOf.hpp @@ -23,23 +23,58 @@ namespace Kokkos { namespace Experimental { namespace Impl { +// +// exespace impl +// template -bool all_of_impl(const std::string& label, const ExecutionSpace& ex, - InputIterator first, InputIterator last, Predicate predicate) { - return (find_if_or_not_impl(label, ex, first, last, predicate) == - last); +bool all_of_exespace_impl(const std::string& label, const ExecutionSpace& ex, + InputIterator first, InputIterator last, + Predicate predicate) { + return (find_if_or_not_exespace_impl(label, ex, first, last, + predicate) == last); } template -bool any_of_impl(const std::string& label, const ExecutionSpace& ex, - InputIterator first, InputIterator last, Predicate predicate) { - return (find_if_or_not_impl(label, ex, first, last, predicate) != last); +bool any_of_exespace_impl(const std::string& label, const ExecutionSpace& ex, + InputIterator first, InputIterator last, + Predicate predicate) { + return (find_if_or_not_exespace_impl(label, ex, first, last, + predicate) != last); } template -bool none_of_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, Predicate predicate) { - return (find_if_or_not_impl(label, ex, first, last, predicate) == last); +bool none_of_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, + Predicate predicate) { + return (find_if_or_not_exespace_impl(label, ex, first, last, + predicate) == last); +} + +// +// team impl +// +template +KOKKOS_FUNCTION bool all_of_team_impl(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + Predicate predicate) { + return (find_if_or_not_team_impl(teamHandle, first, last, predicate) == + last); +} + +template +KOKKOS_FUNCTION bool any_of_team_impl(const TeamHandleType& teamHandle, + InputIterator first, InputIterator last, + Predicate predicate) { + return (find_if_or_not_team_impl(teamHandle, first, last, predicate) != + last); +} + +template +KOKKOS_FUNCTION bool none_of_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + Predicate predicate) { + return (find_if_or_not_team_impl(teamHandle, first, last, predicate) == + last); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 0376100410..54bb13e25b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -24,18 +24,21 @@ namespace Kokkos { namespace Experimental { namespace Impl { +template +class RandomAccessIterator; + template struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< - T, std::enable_if_t< ::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)> > + T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value)>> : std::true_type {}; template @@ -55,6 +58,21 @@ using iterator_category_t = typename T::iterator_category; template using is_iterator = Kokkos::is_detected; +template +inline constexpr bool is_iterator_v = is_iterator::value; + +template +struct is_kokkos_iterator : std::false_type {}; + +template +struct is_kokkos_iterator> { + static constexpr bool value = + is_admissible_to_kokkos_std_algorithms::value; +}; + +template +inline constexpr bool is_kokkos_iterator_v = is_kokkos_iterator::value; + // // are_iterators // @@ -63,15 +81,18 @@ struct are_iterators; template struct are_iterators { - static constexpr bool value = is_iterator::value; + static constexpr bool value = is_iterator_v; }; template struct are_iterators { static constexpr bool value = - are_iterators::value && are_iterators::value; + are_iterators::value && (are_iterators::value && ... && true); }; +template +inline constexpr bool are_iterators_v = are_iterators::value; + // // are_random_access_iterators // @@ -81,17 +102,21 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static constexpr bool value = - is_iterator::value && - std::is_base_of::value; + is_iterator_v && std::is_base_of::value; }; template struct are_random_access_iterators { - static constexpr bool value = are_random_access_iterators::value && - are_random_access_iterators::value; + static constexpr bool value = + are_random_access_iterators::value && + (are_random_access_iterators::value && ... && true); }; +template +inline constexpr bool are_random_access_iterators_v = + are_random_access_iterators::value; + // // iterators_are_accessible_from // @@ -113,16 +138,18 @@ struct iterators_are_accessible_from { iterators_are_accessible_from::value; }; -template +template KOKKOS_INLINE_FUNCTION constexpr void -static_assert_random_access_and_accessible(const ExecutionSpace& /* ex */, - IteratorTypes... /* iterators */) { +static_assert_random_access_and_accessible( + const ExecutionSpaceOrTeamHandleType& /* ex_or_th*/, + IteratorTypes... /* iterators */) { static_assert( are_random_access_iterators::value, "Currently, Kokkos standard algorithms require random access iterators."); - static_assert( - iterators_are_accessible_from::value, - "Incompatible view/iterator and execution space"); + static_assert(iterators_are_accessible_from< + typename ExecutionSpaceOrTeamHandleType::execution_space, + IteratorTypes...>::value, + "Incompatible view/iterator and execution space"); } // @@ -182,10 +209,10 @@ struct not_openmptarget { #endif }; -template +template KOKKOS_INLINE_FUNCTION constexpr void static_assert_is_not_openmptarget( - const ExecutionSpace&) { - static_assert(not_openmptarget::value, + const ExecutionSpaceOrTeamHandleType& /*ex_or_th*/) { + static_assert(not_openmptarget::value, "Currently, Kokkos standard algorithms do not support custom " "comparators in OpenMPTarget"); } @@ -194,7 +221,8 @@ KOKKOS_INLINE_FUNCTION constexpr void static_assert_is_not_openmptarget( // valid range // template -void expect_valid_range(IteratorType first, IteratorType last) { +KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, + IteratorType last) { // this is a no-op for release KOKKOS_EXPECTS(last >= first); // avoid compiler complaining when KOKKOS_EXPECTS is no-op @@ -202,6 +230,38 @@ void expect_valid_range(IteratorType first, IteratorType last) { (void)last; } +// +// Check if kokkos iterators are overlapping +// +template +KOKKOS_INLINE_FUNCTION void expect_no_overlap( + [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, + [[maybe_unused]] IteratorType2 s_first) { + if constexpr (is_kokkos_iterator_v && + is_kokkos_iterator_v) { + auto const view1 = first.view(); + auto const view2 = s_first.view(); + + std::size_t stride1 = view1.stride(0); + std::size_t stride2 = view2.stride(0); + ptrdiff_t first_diff = view1.data() - view2.data(); + + // FIXME If strides are not identical, checks may not be made + // with the cost of O(1) + // Currently, checks are made only if strides are identical + // If first_diff == 0, there is already an overlap + if (stride1 == stride2 || first_diff == 0) { + [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); + auto* first_pointer1 = view1.data(); + auto* first_pointer2 = view2.data(); + [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first); + [[maybe_unused]] auto* last_pointer2 = first_pointer2 + (last - first); + KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 || + last_pointer1 <= first_pointer2 || is_no_overlap); + } + } +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp index b3adbc5e2d..0f68c9e978 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyBackward.hpp @@ -27,16 +27,18 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdCopyBackwardFunctor { - static_assert(std::is_signed::value, - "Kokkos: StdCopyBackwardFunctor requires signed index type"); + // we can use difference type from IteratorType1 since + // the calling functions below already static assert that + // the iterators have matching difference type + using index_type = typename IteratorType1::difference_type; IteratorType1 m_last; IteratorType2 m_dest_last; KOKKOS_FUNCTION - void operator()(IndexType i) const { m_dest_last[-i - 1] = m_last[-i - 1]; } + void operator()(index_type i) const { m_dest_last[-i - 1] = m_last[-i - 1]; } KOKKOS_FUNCTION StdCopyBackwardFunctor(IteratorType1 _last, IteratorType2 _dest_last) @@ -44,30 +46,51 @@ struct StdCopyBackwardFunctor { }; template -IteratorType2 copy_backward_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType1 first, - IteratorType1 last, IteratorType2 d_last) { +IteratorType2 copy_backward_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType1 first, + IteratorType1 last, + IteratorType2 d_last) { // checks Impl::static_assert_random_access_and_accessible(ex, first, d_last); Impl::static_assert_iterators_have_matching_difference_type(first, d_last); Impl::expect_valid_range(first, last); - // aliases - using index_type = typename IteratorType1::difference_type; - using func_t = - StdCopyBackwardFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_for(label, RangePolicy(ex, 0, num_elements), - func_t(last, d_last)); + // use CTAD + StdCopyBackwardFunctor(last, d_last)); ex.fence("Kokkos::copy_backward: fence after operation"); // return return d_last - num_elements; } +// +// team-level impl +// +template +KOKKOS_FUNCTION IteratorType2 +copy_backward_team_impl(const TeamHandleType& teamHandle, IteratorType1 first, + IteratorType1 last, IteratorType2 d_last) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first, d_last); + Impl::static_assert_iterators_have_matching_difference_type(first, d_last); + Impl::expect_valid_range(first, last); + + // run + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements), + // use CTAD + StdCopyBackwardFunctor(last, d_last)); + teamHandle.team_barrier(); + + // return + return d_last - num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp index 1b120c46d0..86e99ecbd0 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyCopyN.hpp @@ -27,13 +27,18 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdCopyFunctor { + // we can use difference type from InputIterator since + // the calling functions below already static assert that + // the iterators have matching difference type + using index_type = typename InputIterator::difference_type; + InputIterator m_first; OutputIterator m_dest_first; KOKKOS_FUNCTION - void operator()(IndexType i) const { m_dest_first[i] = m_first[i]; } + void operator()(index_type i) const { m_dest_first[i] = m_first[i]; } KOKKOS_FUNCTION StdCopyFunctor(InputIterator _first, OutputIterator _dest_first) @@ -41,23 +46,20 @@ struct StdCopyFunctor { }; template -OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex, - InputIterator first, InputIterator last, - OutputIterator d_first) { +OutputIterator copy_exespace_impl(const std::string& label, + const ExecutionSpace& ex, InputIterator first, + InputIterator last, OutputIterator d_first) { // checks Impl::static_assert_random_access_and_accessible(ex, first, d_first); Impl::static_assert_iterators_have_matching_difference_type(first, d_first); Impl::expect_valid_range(first, last); - // aliases - using index_type = typename InputIterator::difference_type; - using func_t = StdCopyFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_for(label, RangePolicy(ex, 0, num_elements), - func_t(first, d_first)); + // use CTAD + StdCopyFunctor(first, d_first)); ex.fence("Kokkos::copy: fence after operation"); // return @@ -66,16 +68,61 @@ OutputIterator copy_impl(const std::string& label, const ExecutionSpace& ex, template -OutputIterator copy_n_impl(const std::string& label, const ExecutionSpace& ex, - InputIterator first_from, Size count, - OutputIterator first_dest) { +OutputIterator copy_n_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + InputIterator first_from, Size count, + OutputIterator first_dest) { // checks Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); Impl::static_assert_iterators_have_matching_difference_type(first_from, first_dest); if (count > 0) { - return copy_impl(label, ex, first_from, first_from + count, first_dest); + return copy_exespace_impl(label, ex, first_from, first_from + count, + first_dest); + } else { + return first_dest; + } +} + +// +// team-level impl +// +template +KOKKOS_FUNCTION OutputIterator copy_team_impl(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator last, + OutputIterator d_first) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first); + Impl::static_assert_iterators_have_matching_difference_type(first, d_first); + Impl::expect_valid_range(first, last); + + // run + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements), + // use CTAD + StdCopyFunctor(first, d_first)); + teamHandle.team_barrier(); + + // return + return d_first + num_elements; +} + +template +KOKKOS_FUNCTION OutputIterator +copy_n_team_impl(const TeamHandleType& teamHandle, InputIterator first_from, + Size count, OutputIterator first_dest) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + + if (count > 0) { + return copy_team_impl(teamHandle, first_from, first_from + count, + first_dest); } else { return first_dest; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index 3c0c4f7e9b..ad7b8bb8ca 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -20,6 +20,7 @@ #include #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" +#include "Kokkos_MustUseKokkosSingleInTeam.hpp" #include #include @@ -27,8 +28,10 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdCopyIfFunctor { + using index_type = typename FirstFrom::difference_type; + FirstFrom m_first_from; FirstDest m_first_dest; PredType m_pred; @@ -40,7 +43,7 @@ struct StdCopyIfFunctor { m_pred(std::move(pred)) {} KOKKOS_FUNCTION - void operator()(const IndexType i, IndexType& update, + void operator()(const index_type i, index_type& update, const bool final_pass) const { const auto& myval = m_first_from[i]; if (final_pass) { @@ -57,9 +60,11 @@ struct StdCopyIfFunctor { template -OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex, - InputIterator first, InputIterator last, - OutputIterator d_first, PredicateType pred) { +OutputIterator copy_if_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + InputIterator first, InputIterator last, + OutputIterator d_first, + PredicateType pred) { /* To explain the impl, suppose that our data is: @@ -90,23 +95,68 @@ OutputIterator copy_if_impl(const std::string& label, const ExecutionSpace& ex, if (first == last) { return d_first; } else { - // aliases - using index_type = typename InputIterator::difference_type; - using func_type = StdCopyIfFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first, last); - index_type count = 0; + + typename InputIterator::difference_type count = 0; ::Kokkos::parallel_scan(label, RangePolicy(ex, 0, num_elements), - func_type(first, d_first, pred), count); + // use CTAD + StdCopyIfFunctor(first, d_first, pred), count); // fence not needed because of the scan accumulating into count return d_first + count; } } +template +KOKKOS_FUNCTION OutputIterator copy_if_team_impl( + const TeamHandleType& teamHandle, InputIterator first, InputIterator last, + OutputIterator d_first, PredicateType pred) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first, d_first); + Impl::static_assert_iterators_have_matching_difference_type(first, d_first); + Impl::expect_valid_range(first, last); + + if (first == last) { + return d_first; + } + + const std::size_t num_elements = Kokkos::Experimental::distance(first, last); + if constexpr (stdalgo_must_use_kokkos_single_for_team_scan_v< + typename TeamHandleType::execution_space>) { + std::size_t count = 0; + Kokkos::single( + Kokkos::PerTeam(teamHandle), + [=](std::size_t& lcount) { + lcount = 0; + for (std::size_t i = 0; i < num_elements; ++i) { + const auto& myval = first[i]; + if (pred(myval)) { + d_first[lcount++] = myval; + } + } + }, + count); + // no barrier needed since single above broadcasts to all members + return d_first + count; + + } else { + typename InputIterator::difference_type count = 0; + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + StdCopyIfFunctor(first, d_first, pred), count); + // no barrier needed because of the scan accumulating into count + return d_first + count; + } + +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp index 18b8c46359..9b6b403aa4 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_CountCountIf.hpp @@ -46,37 +46,65 @@ struct StdCountIfFunctor { }; template -typename IteratorType::difference_type count_if_impl(const std::string& label, - const ExecutionSpace& ex, - IteratorType first, - IteratorType last, - Predicate predicate) { +typename IteratorType::difference_type count_if_exespace_impl( + const std::string& label, const ExecutionSpace& ex, IteratorType first, + IteratorType last, Predicate predicate) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); - // aliases - using func_t = StdCountIfFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first, last); typename IteratorType::difference_type count = 0; ::Kokkos::parallel_reduce(label, RangePolicy(ex, 0, num_elements), - func_t(first, predicate), count); + // use CTAD + StdCountIfFunctor(first, predicate), count); ex.fence("Kokkos::count_if: fence after operation"); return count; } template -auto count_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, const T& value) { - return count_if_impl( +auto count_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, + const T& value) { + return count_if_exespace_impl( label, ex, first, last, ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate(value)); } +// +// team-level impl +// +template +KOKKOS_FUNCTION typename IteratorType::difference_type count_if_team_impl( + const TeamHandleType& teamHandle, IteratorType first, IteratorType last, + Predicate predicate) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + // run + const auto num_elements = Kokkos::Experimental::distance(first, last); + typename IteratorType::difference_type count = 0; + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements), + // use CTAD + StdCountIfFunctor(first, predicate), count); + teamHandle.team_barrier(); + + return count; +} + +template +KOKKOS_FUNCTION auto count_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + const T& value) { + return count_if_team_impl( + teamHandle, first, last, + ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate(value)); +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp index e045080d4a..62b7d226f6 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Equal.hpp @@ -27,15 +27,16 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template +template struct StdEqualFunctor { + using index_type = typename IteratorType1::difference_type; + IteratorType1 m_first1; IteratorType2 m_first2; BinaryPredicateType m_predicate; KOKKOS_FUNCTION - void operator()(IndexType i, std::size_t& lsum) const { + void operator()(index_type i, std::size_t& lsum) const { if (!m_predicate(m_first1[i], m_first2[i])) { lsum = 1; } @@ -49,67 +50,130 @@ struct StdEqualFunctor { m_predicate(std::move(_predicate)) {} }; +// +// exespace impl +// template -bool equal_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, - BinaryPredicateType predicate) { +bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, BinaryPredicateType predicate) { // checks Impl::static_assert_random_access_and_accessible(ex, first1, first2); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); Impl::expect_valid_range(first1, last1); - // aliases - using index_type = typename IteratorType1::difference_type; - using func_t = StdEqualFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first1, last1); std::size_t different = 0; - ::Kokkos::parallel_reduce(label, - RangePolicy(ex, 0, num_elements), - func_t(first1, first2, predicate), different); + ::Kokkos::parallel_reduce( + label, RangePolicy(ex, 0, num_elements), + StdEqualFunctor(first1, first2, predicate), different); ex.fence("Kokkos::equal: fence after operation"); return !different; } template -bool equal_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2) { +bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2) { using value_type1 = typename IteratorType1::value_type; using value_type2 = typename IteratorType2::value_type; using pred_t = StdAlgoEqualBinaryPredicate; - return equal_impl(label, ex, first1, last1, first2, pred_t()); + return equal_exespace_impl(label, ex, first1, last1, first2, pred_t()); } template -bool equal_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, - IteratorType2 last2, BinaryPredicateType predicate) { +bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, + BinaryPredicateType predicate) { const auto d1 = ::Kokkos::Experimental::distance(first1, last1); const auto d2 = ::Kokkos::Experimental::distance(first2, last2); if (d1 != d2) { return false; } - return equal_impl(label, ex, first1, last1, first2, predicate); + return equal_exespace_impl(label, ex, first1, last1, first2, predicate); } template -bool equal_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, - IteratorType2 last2) { +bool equal_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2) { + Impl::expect_valid_range(first1, last1); + Impl::expect_valid_range(first2, last2); + + using value_type1 = typename IteratorType1::value_type; + using value_type2 = typename IteratorType2::value_type; + using pred_t = StdAlgoEqualBinaryPredicate; + return equal_exespace_impl(label, ex, first1, last1, first2, last2, pred_t()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, + BinaryPredicateType predicate) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2); + Impl::static_assert_iterators_have_matching_difference_type(first1, first2); + Impl::expect_valid_range(first1, last1); + + // run + const auto num_elements = Kokkos::Experimental::distance(first1, last1); + std::size_t different = 0; + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements), + StdEqualFunctor(first1, first2, predicate), + different); + teamHandle.team_barrier(); + + return !different; +} + +template +KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2) { + using value_type1 = typename IteratorType1::value_type; + using value_type2 = typename IteratorType2::value_type; + using pred_t = StdAlgoEqualBinaryPredicate; + return equal_team_impl(teamHandle, first1, last1, first2, pred_t()); +} + +template +KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, + BinaryPredicateType predicate) { + const auto d1 = ::Kokkos::Experimental::distance(first1, last1); + const auto d2 = ::Kokkos::Experimental::distance(first2, last2); + if (d1 != d2) { + return false; + } + + return equal_team_impl(teamHandle, first1, last1, first2, predicate); +} + +template +KOKKOS_FUNCTION bool equal_team_impl(const TeamHandleType& teamHandle, + IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, + IteratorType2 last2) { Impl::expect_valid_range(first1, last1); Impl::expect_valid_range(first2, last2); using value_type1 = typename IteratorType1::value_type; using value_type2 = typename IteratorType2::value_type; using pred_t = StdAlgoEqualBinaryPredicate; - return equal_impl(label, ex, first1, last1, first2, last2, pred_t()); + return equal_team_impl(teamHandle, first1, last1, first2, last2, pred_t()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp index 71f13e490a..6da992b4bb 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp @@ -22,6 +22,7 @@ #include "Kokkos_HelperPredicates.hpp" #include "Kokkos_ValueWrapperForNoNeutralElement.hpp" #include "Kokkos_IdentityReferenceUnaryFunctor.hpp" +#include "Kokkos_FunctorsForExclusiveScan.hpp" #include #include #include @@ -30,127 +31,15 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template -struct ExclusiveScanDefaultFunctorForKnownNeutralElement { - using execution_space = ExeSpace; - - ValueType m_init_value; - FirstFrom m_first_from; - FirstDest m_first_dest; - - KOKKOS_FUNCTION - ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init, - FirstFrom first_from, - FirstDest first_dest) - : m_init_value(std::move(init)), - m_first_from(std::move(first_from)), - m_first_dest(std::move(first_dest)) {} - - KOKKOS_FUNCTION - void operator()(const IndexType i, ValueType& update, - const bool final_pass) const { - if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; - } -}; - -template -struct ExclusiveScanDefaultFunctor { - using execution_space = ExeSpace; - using value_type = - ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; - - ValueType m_init_value; - FirstFrom m_first_from; - FirstDest m_first_dest; - - KOKKOS_FUNCTION - ExclusiveScanDefaultFunctor(ValueType init, FirstFrom first_from, - FirstDest first_dest) - : m_init_value(std::move(init)), - m_first_from(std::move(first_from)), - m_first_dest(std::move(first_dest)) {} - - KOKKOS_FUNCTION - void operator()(const IndexType i, value_type& update, - const bool final_pass) const { - if (final_pass) { - if (i == 0) { - m_first_dest[i] = m_init_value; - } else { - m_first_dest[i] = update.val + m_init_value; - } - } - - const auto tmp = value_type{m_first_from[i], false}; - this->join(update, tmp); - } - - KOKKOS_FUNCTION - void init(value_type& update) const { - update.val = {}; - update.is_initial = true; - } - - KOKKOS_FUNCTION - void join(value_type& update, const value_type& input) const { - if (input.is_initial) return; - - if (update.is_initial) { - update.val = input.val; - update.is_initial = false; - } else { - update.val = update.val + input.val; - } - } -}; - +// +// exespace impl +// template -OutputIteratorType exclusive_scan_custom_op_impl( + class OutputIteratorType, class ValueType> +OutputIteratorType exclusive_scan_default_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, - OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { - // checks - Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); - Impl::static_assert_iterators_have_matching_difference_type(first_from, - first_dest); - Impl::expect_valid_range(first_from, last_from); - - // aliases - using index_type = typename InputIteratorType::difference_type; - using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = - TransformExclusiveScanFunctor; - - // run - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - ::Kokkos::parallel_scan( - label, RangePolicy(ex, 0, num_elements), - func_type(init_value, first_from, first_dest, bop, unary_op_type())); - ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation"); - - // return - return first_dest + num_elements; -} - -template -using ex_scan_has_reduction_identity_sum_t = - decltype(Kokkos::reduction_identity::sum()); - -template -OutputIteratorType exclusive_scan_default_op_impl(const std::string& label, - const ExecutionSpace& ex, - InputIteratorType first_from, - InputIteratorType last_from, - OutputIteratorType first_dest, - ValueType init_value) { + OutputIteratorType first_dest, ValueType init_value) { // checks Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); Impl::static_assert_iterators_have_matching_difference_type(first_from, @@ -184,17 +73,122 @@ OutputIteratorType exclusive_scan_default_op_impl(const std::string& label, ExclusiveScanDefaultFunctorForKnownNeutralElement< ExecutionSpace, index_type, ValueType, InputIteratorType, OutputIteratorType>, - ExclusiveScanDefaultFunctor>; + ExclusiveScanDefaultFunctorWithValueWrapper>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan( + label, RangePolicy(ex, 0, num_elements), + func_type(std::move(init_value), first_from, first_dest)); + + ex.fence("Kokkos::exclusive_scan_default_op: fence after operation"); + + return first_dest + num_elements; +} + +template +OutputIteratorType exclusive_scan_custom_op_exespace_impl( + const std::string& label, const ExecutionSpace& ex, + InputIteratorType first_from, InputIteratorType last_from, + OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { + // checks + Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + // aliases + using index_type = typename InputIteratorType::difference_type; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TransformExclusiveScanFunctorWithValueWrapper< + ExecutionSpace, index_type, ValueType, InputIteratorType, + OutputIteratorType, BinaryOpType, unary_op_type>; // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); ::Kokkos::parallel_scan(label, RangePolicy(ex, 0, num_elements), - func_type(init_value, first_from, first_dest)); + func_type(std::move(init_value), first_from, + first_dest, bop, unary_op_type())); + ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation"); - ex.fence("Kokkos::exclusive_scan_default_op: fence after operation"); + // return + return first_dest + num_elements; +} + +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType exclusive_scan_default_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + ValueType init_value) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "The team-level impl of Kokkos::Experimental::exclusive_scan currently " + "does not support types without reduction identity"); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using index_type = typename InputIteratorType::difference_type; + using func_type = ExclusiveScanDefaultFunctorForKnownNeutralElement< + exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType>; + + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(std::move(init_value), first_from, first_dest)); + teamHandle.team_barrier(); + return first_dest + num_elements; +} + +template +KOKKOS_FUNCTION OutputIteratorType exclusive_scan_custom_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + ValueType init_value, BinaryOpType bop) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "The team-level impl of Kokkos::Experimental::exclusive_scan currently " + "does not support types without reduction identity"); + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using index_type = typename InputIteratorType::difference_type; + using func_type = TransformExclusiveScanFunctorWithoutValueWrapper< + exe_space, index_type, ValueType, InputIteratorType, OutputIteratorType, + BinaryOpType, unary_op_type>; + + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(std::move(init_value), first_from, + first_dest, bop, unary_op_type())); + teamHandle.team_barrier(); return first_dest + num_elements; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp index 316d865f31..972e57f2cc 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FillFillN.hpp @@ -41,9 +41,12 @@ struct StdFillFunctor { : m_first(std::move(_first)), m_value(std::move(_value)) {} }; +// +// exespace impl +// template -void fill_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, const T& value) { +void fill_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, const T& value) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); @@ -52,13 +55,14 @@ void fill_impl(const std::string& label, const ExecutionSpace& ex, const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_for(label, RangePolicy(ex, 0, num_elements), - StdFillFunctor(first, value)); + StdFillFunctor(first, value)); ex.fence("Kokkos::fill: fence after operation"); } template -IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, SizeType n, const T& value) { +IteratorType fill_n_exespace_impl(const std::string& label, + const ExecutionSpace& ex, IteratorType first, + SizeType n, const T& value) { auto last = first + n; Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); @@ -67,7 +71,40 @@ IteratorType fill_n_impl(const std::string& label, const ExecutionSpace& ex, return first; } - fill_impl(label, ex, first, last, value); + fill_exespace_impl(label, ex, first, last, value); + return last; +} + +// +// team-level impl +// +template +KOKKOS_FUNCTION void fill_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + const T& value) { + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements), + StdFillFunctor(first, value)); + + teamHandle.team_barrier(); +} + +template +KOKKOS_FUNCTION IteratorType fill_n_team_impl(const TeamHandleType& teamHandle, + IteratorType first, SizeType n, + const T& value) { + auto last = first + n; + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + if (n <= 0) { + return first; + } + + fill_team_impl(teamHandle, first, last, value); return last; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp index 3ec64fa43d..1f1ec5e54f 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindEnd.hpp @@ -80,12 +80,17 @@ struct StdFindEndFunctor { m_p(std::move(p)) {} }; +// +// exespace impl +// template -IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first, IteratorType1 last, - IteratorType2 s_first, IteratorType2 s_last, - const BinaryPredicateType& pred) { +IteratorType1 find_end_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last, + const BinaryPredicateType& pred) { // checks Impl::static_assert_random_access_and_accessible(ex, first, s_first); Impl::static_assert_iterators_have_matching_difference_type(first, s_first); @@ -97,7 +102,6 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex, const auto num_elements = KE::distance(first, last); const auto s_count = KE::distance(s_first, s_last); KOKKOS_EXPECTS(num_elements >= s_count); - (void)s_count; // needed when macro above is a no-op if (s_first == s_last) { return last; @@ -109,7 +113,8 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex, // special case where the two ranges have equal size if (num_elements == s_count) { - const auto equal_result = equal_impl(label, ex, first, last, s_first, pred); + const auto equal_result = + equal_exespace_impl(label, ex, first, last, s_first, pred); return (equal_result) ? first : last; } else { using index_type = typename IteratorType1::difference_type; @@ -148,14 +153,97 @@ IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex, } template -IteratorType1 find_end_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType1 first, IteratorType1 last, - IteratorType2 s_first, IteratorType2 s_last) { +IteratorType1 find_end_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, + IteratorType2 s_last) { + using value_type1 = typename IteratorType1::value_type; + using value_type2 = typename IteratorType2::value_type; + using predicate_type = StdAlgoEqualBinaryPredicate; + return find_end_exespace_impl(label, ex, first, last, s_first, s_last, + predicate_type()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION IteratorType1 +find_end_team_impl(const TeamHandleType& teamHandle, IteratorType1 first, + IteratorType1 last, IteratorType2 s_first, + IteratorType2 s_last, const BinaryPredicateType& pred) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first, s_first); + Impl::static_assert_iterators_have_matching_difference_type(first, s_first); + Impl::expect_valid_range(first, last); + Impl::expect_valid_range(s_first, s_last); + + // the target sequence should not be larger than the range [first, last) + namespace KE = ::Kokkos::Experimental; + const auto num_elements = KE::distance(first, last); + const auto s_count = KE::distance(s_first, s_last); + KOKKOS_EXPECTS(num_elements >= s_count); + + if (s_first == s_last) { + return last; + } + + if (first == last) { + return last; + } + + // special case where the two ranges have equal size + if (num_elements == s_count) { + const auto equal_result = + equal_team_impl(teamHandle, first, last, s_first, pred); + return (equal_result) ? first : last; + } else { + using index_type = typename IteratorType1::difference_type; + using reducer_type = LastLoc; + using reduction_value_type = typename reducer_type::value_type; + using func_t = StdFindEndFunctor; + + // run + reduction_value_type red_result; + reducer_type reducer(red_result); + + // decide the size of the range policy of the par_red: + // note that the last feasible index to start looking is the index + // whose distance from the "last" is equal to the sequence count. + // the +1 is because we need to include that location too. + const auto range_size = num_elements - s_count + 1; + + // run par reduce + ::Kokkos::parallel_reduce( + TeamThreadRange(teamHandle, 0, range_size), + func_t(first, last, s_first, s_last, reducer, pred), reducer); + + teamHandle.team_barrier(); + + // decide and return + if (red_result.max_loc_true == + ::Kokkos::reduction_identity::max()) { + // if here, a subrange has not been found + return last; + } else { + // a location has been found + return first + red_result.max_loc_true; + } + } +} + +template +KOKKOS_FUNCTION IteratorType1 find_end_team_impl( + const TeamHandleType& teamHandle, IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, IteratorType2 s_last) { using value_type1 = typename IteratorType1::value_type; using value_type2 = typename IteratorType2::value_type; using predicate_type = StdAlgoEqualBinaryPredicate; - return find_end_impl(label, ex, first, last, s_first, s_last, - predicate_type()); + return find_end_team_impl(teamHandle, first, last, s_first, s_last, + predicate_type()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp index 5f22d2ad13..145e235b9d 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindFirstOf.hpp @@ -71,13 +71,15 @@ struct StdFindFirstOfFunctor { m_p(std::move(p)) {} }; +// +// exespace impl +// template -IteratorType1 find_first_of_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType1 first, - IteratorType1 last, IteratorType2 s_first, - IteratorType2 s_last, - const BinaryPredicateType& pred) { +IteratorType1 find_first_of_exespace_impl( + const std::string& label, const ExecutionSpace& ex, IteratorType1 first, + IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last, + const BinaryPredicateType& pred) { // checks Impl::static_assert_random_access_and_accessible(ex, first, s_first); Impl::static_assert_iterators_have_matching_difference_type(first, s_first); @@ -116,15 +118,71 @@ IteratorType1 find_first_of_impl(const std::string& label, } template -IteratorType1 find_first_of_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType1 first, - IteratorType1 last, IteratorType2 s_first, - IteratorType2 s_last) { +IteratorType1 find_first_of_exespace_impl( + const std::string& label, const ExecutionSpace& ex, IteratorType1 first, + IteratorType1 last, IteratorType2 s_first, IteratorType2 s_last) { + using value_type1 = typename IteratorType1::value_type; + using value_type2 = typename IteratorType2::value_type; + using predicate_type = StdAlgoEqualBinaryPredicate; + return find_first_of_exespace_impl(label, ex, first, last, s_first, s_last, + predicate_type()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION IteratorType1 +find_first_of_team_impl(const TeamHandleType& teamHandle, IteratorType1 first, + IteratorType1 last, IteratorType2 s_first, + IteratorType2 s_last, const BinaryPredicateType& pred) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first, s_first); + Impl::static_assert_iterators_have_matching_difference_type(first, s_first); + Impl::expect_valid_range(first, last); + Impl::expect_valid_range(s_first, s_last); + + if ((s_first == s_last) || (first == last)) { + return last; + } + + using index_type = typename IteratorType1::difference_type; + using reducer_type = FirstLoc; + using reduction_value_type = typename reducer_type::value_type; + using func_t = StdFindFirstOfFunctor; + + // run + reduction_value_type red_result; + reducer_type reducer(red_result); + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements), + func_t(first, s_first, s_last, reducer, pred), + reducer); + + teamHandle.team_barrier(); + + // decide and return + if (red_result.min_loc_true == + ::Kokkos::reduction_identity::min()) { + // if here, nothing found + return last; + } else { + // a location has been found + return first + red_result.min_loc_true; + } +} + +template +KOKKOS_FUNCTION IteratorType1 find_first_of_team_impl( + const TeamHandleType& teamHandle, IteratorType1 first, IteratorType1 last, + IteratorType2 s_first, IteratorType2 s_last) { using value_type1 = typename IteratorType1::value_type; using value_type2 = typename IteratorType2::value_type; using predicate_type = StdAlgoEqualBinaryPredicate; - return find_first_of_impl(label, ex, first, last, s_first, s_last, - predicate_type()); + return find_first_of_team_impl(teamHandle, first, last, s_first, s_last, + predicate_type()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp index 9c0b0c0ccd..8fffb59094 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FindIfOrNot.hpp @@ -61,11 +61,15 @@ struct StdFindIfOrNotFunctor { m_p(std::move(p)) {} }; +// +// exespace impl +// template -IteratorType find_if_or_not_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last, PredicateType pred) { +IteratorType find_if_or_not_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, IteratorType last, + PredicateType pred) { // checks Impl::static_assert_random_access_and_accessible( ex, first); // only need one It per type @@ -104,14 +108,68 @@ IteratorType find_if_or_not_impl(const std::string& label, } template -InputIterator find_impl(const std::string& label, ExecutionSpace ex, - InputIterator first, InputIterator last, - const T& value) { - return find_if_or_not_impl( +InputIterator find_exespace_impl(const std::string& label, ExecutionSpace ex, + InputIterator first, InputIterator last, + const T& value) { + return find_if_or_not_exespace_impl( label, ex, first, last, ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate(value)); } +// +// team impl +// +template +KOKKOS_FUNCTION IteratorType +find_if_or_not_team_impl(const TeamHandleType& teamHandle, IteratorType first, + IteratorType last, PredicateType pred) { + // checks + Impl::static_assert_random_access_and_accessible( + teamHandle, first); // only need one It per type + Impl::expect_valid_range(first, last); + + if (first == last) { + return last; + } + + // aliases + using index_type = typename IteratorType::difference_type; + using reducer_type = FirstLoc; + using reduction_value_type = typename reducer_type::value_type; + using func_t = StdFindIfOrNotFunctor; + + // run + reduction_value_type red_result; + reducer_type reducer(red_result); + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements), + func_t(first, reducer, pred), reducer); + + teamHandle.team_barrier(); + + // decide and return + if (red_result.min_loc_true == + ::Kokkos::reduction_identity::min()) { + // here, it means a valid loc has not been found, + return last; + } else { + // a location has been found + return first + red_result.min_loc_true; + } +} + +template +KOKKOS_FUNCTION InputIterator find_team_impl(const TeamHandleType& teamHandle, + InputIterator first, + InputIterator last, + const T& value) { + return find_if_or_not_team_impl( + teamHandle, first, last, + ::Kokkos::Experimental::Impl::StdAlgoEqualsValUnaryPredicate(value)); +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp index f9a6ff2e99..99cc4a1cf3 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp @@ -41,29 +41,28 @@ struct StdForEachFunctor { : m_first(std::move(_first)), m_functor(std::move(_functor)) {} }; -template -UnaryFunctorType for_each_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { +template +void for_each_exespace_impl(const std::string& label, const HandleType& handle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks - Impl::static_assert_random_access_and_accessible(ex, first); + Impl::static_assert_random_access_and_accessible(handle, first); Impl::expect_valid_range(first, last); // run const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_for( - label, RangePolicy(ex, 0, num_elements), + label, RangePolicy(handle, 0, num_elements), StdForEachFunctor(first, functor)); - ex.fence("Kokkos::for_each: fence after operation"); - - return functor; + handle.fence("Kokkos::for_each: fence after operation"); } template -IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, SizeType n, - UnaryFunctorType functor) { +IteratorType for_each_n_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, SizeType n, + UnaryFunctorType functor) { auto last = first + n; Impl::static_assert_random_access_and_accessible(ex, first, last); Impl::expect_valid_range(first, last); @@ -72,8 +71,45 @@ IteratorType for_each_n_impl(const std::string& label, const ExecutionSpace& ex, return first; } - for_each_impl(label, ex, first, last, std::move(functor)); - // no neeed to fence since for_each_impl fences already + for_each_exespace_impl(label, ex, first, last, std::move(functor)); + // no need to fence since for_each_exespace_impl fences already + + return last; +} + +// +// team impl +// +template +KOKKOS_FUNCTION void for_each_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + // run + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_for( + TeamThreadRange(teamHandle, 0, num_elements), + StdForEachFunctor(first, functor)); + teamHandle.team_barrier(); +} + +template +KOKKOS_FUNCTION IteratorType +for_each_n_team_impl(const TeamHandleType& teamHandle, IteratorType first, + SizeType n, UnaryFunctorType functor) { + auto last = first + n; + Impl::static_assert_random_access_and_accessible(teamHandle, first, last); + Impl::expect_valid_range(first, last); + + if (n == 0) { + return first; + } + + for_each_team_impl(teamHandle, first, last, std::move(functor)); + // no need to fence since for_each_team_impl fences already return last; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp new file mode 100644 index 0000000000..5a7fe16984 --- /dev/null +++ b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -0,0 +1,221 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP +#define KOKKOS_STD_ALGORITHMS_FUNCTORS_FOR_EXCLUSIVE_SCAN_IMPL_HPP + +#include +#include "Kokkos_ValueWrapperForNoNeutralElement.hpp" + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template +using ex_scan_has_reduction_identity_sum_t = + decltype(Kokkos::reduction_identity::sum()); + +template +struct ExclusiveScanDefaultFunctorForKnownNeutralElement { + using execution_space = ExeSpace; + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + + KOKKOS_FUNCTION + ExclusiveScanDefaultFunctorForKnownNeutralElement(ValueType init, + FirstFrom first_from, + FirstDest first_dest) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, ValueType& update, + const bool final_pass) const { + const auto tmp = m_first_from[i]; + if (final_pass) m_first_dest[i] = update + m_init_value; + update += tmp; + } +}; + +template +struct ExclusiveScanDefaultFunctorWithValueWrapper { + using execution_space = ExeSpace; + using value_type = + ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + + KOKKOS_FUNCTION + ExclusiveScanDefaultFunctorWithValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, value_type& update, + const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; + if (final_pass) { + if (i == 0) { + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = update.val + m_init_value; + } + } + + this->join(update, tmp); + } + + KOKKOS_FUNCTION + void init(value_type& update) const { + update.val = {}; + update.is_initial = true; + } + + KOKKOS_FUNCTION + void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + + if (update.is_initial) { + update.val = input.val; + update.is_initial = false; + } else { + update.val = update.val + input.val; + } + } +}; + +template +struct TransformExclusiveScanFunctorWithValueWrapper { + using execution_space = ExeSpace; + using value_type = + ::Kokkos::Experimental::Impl::ValueWrapperForNoNeutralElement; + + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + + KOKKOS_FUNCTION + TransformExclusiveScanFunctorWithValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, value_type& update, + const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; + if (final_pass) { + if (i == 0) { + // for both ExclusiveScan and TransformExclusiveScan, + // init is unmodified + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = m_binary_op(update.val, m_init_value); + } + } + + this->join(update, tmp); + } + + KOKKOS_FUNCTION void init(value_type& value) const { + value.val = {}; + value.is_initial = true; + } + + KOKKOS_FUNCTION + void join(value_type& update, const value_type& input) const { + if (input.is_initial) return; + + if (update.is_initial) { + update.val = input.val; + } else { + update.val = m_binary_op(update.val, input.val); + } + update.is_initial = false; + } +}; + +template +struct TransformExclusiveScanFunctorWithoutValueWrapper { + using execution_space = ExeSpace; + + ValueType m_init_value; + FirstFrom m_first_from; + FirstDest m_first_dest; + BinaryOpType m_binary_op; + UnaryOpType m_unary_op; + + KOKKOS_FUNCTION + TransformExclusiveScanFunctorWithoutValueWrapper(ValueType init, + FirstFrom first_from, + FirstDest first_dest, + BinaryOpType bop, + UnaryOpType uop) + : m_init_value(std::move(init)), + m_first_from(std::move(first_from)), + m_first_dest(std::move(first_dest)), + m_binary_op(std::move(bop)), + m_unary_op(std::move(uop)) {} + + KOKKOS_FUNCTION + void operator()(const IndexType i, ValueType& update, + const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; + if (final_pass) { + if (i == 0) { + // for both ExclusiveScan and TransformExclusiveScan, + // init is unmodified + m_first_dest[i] = m_init_value; + } else { + m_first_dest[i] = m_binary_op(update, m_init_value); + } + } + + this->join(update, tmp); + } + + KOKKOS_FUNCTION + void init(ValueType& update) const { update = {}; } + + KOKKOS_FUNCTION + void join(ValueType& update, const ValueType& input) const { + update = m_binary_op(update, input); + } +}; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp b/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp index 228390bdff..157de1125e 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_GenerateGenerateN.hpp @@ -41,32 +41,65 @@ struct StdGenerateFunctor { : m_first(std::move(_first)), m_generator(std::move(_g)) {} }; +// +// generate impl +// template -void generate_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, Generator g) { +void generate_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, + Generator g) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); - // aliases - using func_t = StdGenerateFunctor; - // run const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_for(label, RangePolicy(ex, 0, num_elements), - func_t(first, g)); + StdGenerateFunctor(first, g)); ex.fence("Kokkos::generate: fence after operation"); } +template +KOKKOS_FUNCTION void generate_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + Generator g) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + // run + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_for(TeamThreadRange(teamHandle, 0, num_elements), + StdGenerateFunctor(first, g)); + teamHandle.team_barrier(); +} + +// +// generate_n impl +// template -IteratorType generate_n_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, Size count, Generator g) { +IteratorType generate_n_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, Size count, + Generator g) { + if (count <= 0) { + return first; + } + + generate_exespace_impl(label, ex, first, first + count, g); + return first + count; +} + +template +KOKKOS_FUNCTION IteratorType +generate_n_team_impl(const TeamHandleType& teamHandle, IteratorType first, + Size count, Generator g) { if (count <= 0) { return first; } - generate_impl(label, ex, first, first + count, g); + generate_team_impl(teamHandle, first, first + count, g); return first + count; } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp index ecd6ff39cd..0b4acec0fe 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp @@ -101,9 +101,12 @@ struct InclusiveScanDefaultFunctor { } }; +// +// exespace impl +// template -OutputIteratorType inclusive_scan_default_op_impl( +OutputIteratorType inclusive_scan_default_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest) { @@ -143,7 +146,7 @@ OutputIteratorType inclusive_scan_default_op_impl( // ------------------------------------------------------------- template -OutputIteratorType inclusive_scan_custom_binary_op_impl( +OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, BinaryOpType binary_op) { @@ -158,7 +161,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( using value_type = std::remove_const_t; using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = TransformInclusiveScanNoInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanNoInitValueFunctor< ExecutionSpace, index_type, value_type, InputIteratorType, OutputIteratorType, BinaryOpType, unary_op_type>; @@ -179,7 +182,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( // ------------------------------------------------------------- template -OutputIteratorType inclusive_scan_custom_binary_op_impl( +OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, BinaryOpType binary_op, @@ -193,7 +196,7 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( // aliases using index_type = typename InputIteratorType::difference_type; using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; - using func_type = TransformInclusiveScanWithInitValueFunctor< + using func_type = ExeSpaceTransformInclusiveScanWithInitValueFunctor< ExecutionSpace, index_type, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, unary_op_type>; @@ -203,13 +206,142 @@ OutputIteratorType inclusive_scan_custom_binary_op_impl( ::Kokkos::parallel_scan(label, RangePolicy(ex, 0, num_elements), func_type(first_from, first_dest, binary_op, - unary_op_type(), init_value)); + unary_op_type(), std::move(init_value))); ex.fence("Kokkos::inclusive_scan_custom_binary_op: fence after operation"); // return return first_dest + num_elements; } +// +// team impl +// +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_default_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + using value_type = + std::remove_const_t; + + // #if defined(KOKKOS_ENABLE_CUDA) + + using exe_space = typename TeamHandleType::execution_space; + using index_type = typename InputIteratorType::difference_type; + using func_type = std::conditional_t< + ::Kokkos::is_detected::value, + InclusiveScanDefaultFunctorForKnownIdentityElement< + exe_space, index_type, value_type, InputIteratorType, + OutputIteratorType>, + InclusiveScanDefaultFunctor>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest)); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + +// ------------------------------------------------------------- +// inclusive_scan_custom_binary_op_impl +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + using value_type = + std::remove_const_t; + + static_assert( + ::Kokkos::is_detected_v, + "At the moment inclusive_scan doesn't support types without reduction " + "identity"); + + // #if defined(KOKKOS_ENABLE_CUDA) + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TeamTransformInclusiveScanNoInitValueFunctor< + exe_space, value_type, InputIteratorType, OutputIteratorType, + BinaryOpType, unary_op_type>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + + ::Kokkos::parallel_scan( + TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, unary_op_type())); + teamHandle.team_barrier(); + + return first_dest + num_elements; +} + +// ------------------------------------------------------------- +// inclusive_scan_custom_binary_op_impl with init_value +// ------------------------------------------------------------- +template +KOKKOS_FUNCTION OutputIteratorType inclusive_scan_custom_binary_op_team_impl( + const TeamHandleType& teamHandle, InputIteratorType first_from, + InputIteratorType last_from, OutputIteratorType first_dest, + BinaryOpType binary_op, ValueType init_value) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first_from, + first_dest); + Impl::static_assert_iterators_have_matching_difference_type(first_from, + first_dest); + Impl::expect_valid_range(first_from, last_from); + + static_assert( + ::Kokkos::is_detected_v, + "At the moment inclusive_scan doesn't support types without reduction " + "identity"); + + // #if defined(KOKKOS_ENABLE_CUDA) + + // aliases + using exe_space = typename TeamHandleType::execution_space; + using unary_op_type = StdNumericScanIdentityReferenceUnaryFunctor; + using func_type = TeamTransformInclusiveScanWithInitValueFunctor< + exe_space, ValueType, InputIteratorType, OutputIteratorType, BinaryOpType, + unary_op_type>; + + // run + const auto num_elements = + Kokkos::Experimental::distance(first_from, last_from); + ::Kokkos::parallel_scan(TeamThreadRange(teamHandle, 0, num_elements), + func_type(first_from, first_dest, binary_op, + unary_op_type(), std::move(init_value))); + teamHandle.team_barrier(); + + // return + return first_dest + num_elements; +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp b/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp index 0fe2d246ff..281efca36b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_IsPartitioned.hpp @@ -62,9 +62,9 @@ struct StdIsPartitionedFunctor { }; template -bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - PredicateType pred) { +bool is_partitioned_exespace_impl(const std::string& label, + const ExecutionSpace& ex, IteratorType first, + IteratorType last, PredicateType pred) { // true if all elements in the range [first, last) that satisfy // the predicate "pred" appear before all elements that don't. // Also returns true if [first, last) is empty. @@ -97,6 +97,7 @@ bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex, const auto num_elements = Kokkos::Experimental::distance(first, last); ::Kokkos::parallel_reduce(label, RangePolicy(ex, 0, num_elements), + func_t(first, reducer, pred), reducer); // fence not needed because reducing into scalar @@ -109,8 +110,72 @@ bool is_partitioned_impl(const std::string& label, const ExecutionSpace& ex, if (red_result.max_loc_true != red_id_max && red_result.min_loc_false != red_id_min) { + // this occurs when the reduction yields nontrivial values return red_result.max_loc_true < red_result.min_loc_false; + } else if (red_result.max_loc_true == red_id_max && + red_result.min_loc_false == 0) { + // this occurs when all values do NOT satisfy + // the predicate, and this corner case should also be true + return true; + } else if (first + red_result.max_loc_true == --last) { + // this occurs when all values satisfy the predicate, + // this corner case should also be true + return true; + } else { + return false; + } +} + +template +KOKKOS_FUNCTION bool is_partitioned_team_impl(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last, + PredicateType pred) { + /* see exespace impl for the description of the impl */ + + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + // trivial case + if (first == last) { + return true; + } + + // aliases + using index_type = typename IteratorType::difference_type; + using reducer_type = StdIsPartitioned; + using reduction_value_type = typename reducer_type::value_type; + using func_t = + StdIsPartitionedFunctor; + + // run + reduction_value_type red_result; + reducer_type reducer(red_result); + const auto num_elements = Kokkos::Experimental::distance(first, last); + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, num_elements), + func_t(first, reducer, pred), reducer); + + // fence not needed because reducing into scalar + + // decide and return + constexpr index_type red_id_min = + ::Kokkos::reduction_identity::min(); + constexpr index_type red_id_max = + ::Kokkos::reduction_identity::max(); + + if (red_result.max_loc_true != red_id_max && + red_result.min_loc_false != red_id_min) { + // this occurs when the reduction yields nontrivial values + return red_result.max_loc_true < red_result.min_loc_false; + } else if (red_result.max_loc_true == red_id_max && + red_result.min_loc_false == 0) { + // this occurs when all values do NOT satisfy + // the predicate, and this corner case should also be true + return true; } else if (first + red_result.max_loc_true == --last) { + // this occurs when all values satisfy the predicate, + // this corner case should also be true return true; } else { return false; diff --git a/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp b/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp index 4696821586..b2c912848a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_IsSorted.hpp @@ -48,10 +48,13 @@ struct StdIsSortedFunctor { : m_first(std::move(_first1)), m_comparator(std::move(comparator)) {} }; +// +// exespace impl +// template -bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - ComparatorType comp) { +bool is_sorted_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, + ComparatorType comp) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); @@ -75,11 +78,49 @@ bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex, } template -bool is_sorted_impl(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last) { +bool is_sorted_exespace_impl(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last) { + using value_type = typename IteratorType::value_type; + using pred_t = Impl::StdAlgoLessThanBinaryPredicate; + return is_sorted_exespace_impl(label, ex, first, last, pred_t()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION bool is_sorted_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + ComparatorType comp) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + const auto num_elements = Kokkos::Experimental::distance(first, last); + if (num_elements <= 1) { + return true; + } + + // use num_elements-1 because each index handles i and i+1 + const auto num_elements_minus_one = num_elements - 1; + + // result is incremented by one if sorting breaks at index i + std::size_t result = 0; + ::Kokkos::parallel_reduce( + TeamThreadRange(teamHandle, 0, num_elements_minus_one), + // use CTAD here + StdIsSortedFunctor(first, std::move(comp)), result); + + return result == 0; +} + +template +KOKKOS_FUNCTION bool is_sorted_team_impl(const TeamHandleType& teamHandle, + IteratorType first, + IteratorType last) { using value_type = typename IteratorType::value_type; using pred_t = Impl::StdAlgoLessThanBinaryPredicate; - return is_sorted_impl(label, ex, first, last, pred_t()); + return is_sorted_team_impl(teamHandle, first, last, pred_t()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp b/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp index 2a0c112bf5..d33580ca53 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_IsSortedUntil.hpp @@ -54,10 +54,15 @@ struct StdIsSortedUntilFunctor { m_reducer(std::move(reducer)) {} }; +// +// overloads accepting exespace +// template -IteratorType is_sorted_until_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last, ComparatorType comp) { +IteratorType is_sorted_until_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, + IteratorType last, + ComparatorType comp) { // checks Impl::static_assert_random_access_and_accessible(ex, first); Impl::expect_valid_range(first, last); @@ -81,7 +86,6 @@ IteratorType is_sorted_until_impl(const std::string& label, label, // use num_elements-1 because each index handles i and i+1 RangePolicy(ex, 0, num_elements - 1), - // use CTAD StdIsSortedUntilFunctor(first, comp, reducer), reducer); /* If the reduction result is equal to the initial value, @@ -98,12 +102,66 @@ IteratorType is_sorted_until_impl(const std::string& label, } template -IteratorType is_sorted_until_impl(const std::string& label, - const ExecutionSpace& ex, IteratorType first, - IteratorType last) { +IteratorType is_sorted_until_exespace_impl(const std::string& label, + const ExecutionSpace& ex, + IteratorType first, + IteratorType last) { + using value_type = typename IteratorType::value_type; + using pred_t = Impl::StdAlgoLessThanBinaryPredicate; + return is_sorted_until_exespace_impl(label, ex, first, last, pred_t()); +} + +// +// overloads accepting team handle +// +template +KOKKOS_FUNCTION IteratorType +is_sorted_until_team_impl(const ExecutionSpace& teamHandle, IteratorType first, + IteratorType last, ComparatorType comp) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first); + Impl::expect_valid_range(first, last); + + const auto num_elements = Kokkos::Experimental::distance(first, last); + + // trivial case + if (num_elements <= 1) { + return last; + } + + /* + Do a par_reduce computing the *min* index that breaks the sorting. + If one such index is found, then the range is sorted until that element, + if no such index is found, then it means the range is sorted until the end. + */ + using index_type = typename IteratorType::difference_type; + index_type red_result; + index_type red_result_init; + ::Kokkos::Min reducer(red_result); + reducer.init(red_result_init); + ::Kokkos::parallel_reduce( // use num_elements-1 because each index handles i + // and i+1 + TeamThreadRange(teamHandle, 0, num_elements - 1), + StdIsSortedUntilFunctor(first, comp, reducer), reducer); + teamHandle.team_barrier(); + + /* If the reduction result is equal to the initial value, + and it means the range is sorted until the end */ + if (red_result == red_result_init) { + return last; + } else { + /* If such index is found, then the range is sorted until there and + we need to return an iterator past the element found so do +1 */ + return first + (red_result + 1); + } +} + +template +KOKKOS_FUNCTION IteratorType is_sorted_until_team_impl( + const ExecutionSpace& teamHandle, IteratorType first, IteratorType last) { using value_type = typename IteratorType::value_type; using pred_t = Impl::StdAlgoLessThanBinaryPredicate; - return is_sorted_until_impl(label, ex, first, last, pred_t()); + return is_sorted_until_team_impl(teamHandle, first, last, pred_t()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp index ad7f59232e..b95a66c3bd 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_LexicographicalCompare.hpp @@ -84,13 +84,15 @@ struct StdLexicographicalCompareFunctor { m_comparator(std::move(_comp)) {} }; +// +// exespace impl +// template -bool lexicographical_compare_impl(const std::string& label, - const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2, IteratorType2 last2, - ComparatorType comp) { +bool lexicographical_compare_exespace_impl( + const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, + IteratorType1 last1, IteratorType2 first2, IteratorType2 last2, + ComparatorType comp) { // checks Impl::static_assert_random_access_and_accessible(ex, first1, first2); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); @@ -139,16 +141,84 @@ bool lexicographical_compare_impl(const std::string& label, } template -bool lexicographical_compare_impl(const std::string& label, - const ExecutionSpace& ex, - IteratorType1 first1, IteratorType1 last1, - IteratorType2 first2, IteratorType2 last2) { +bool lexicographical_compare_exespace_impl( + const std::string& label, const ExecutionSpace& ex, IteratorType1 first1, + IteratorType1 last1, IteratorType2 first2, IteratorType2 last2) { + using value_type_1 = typename IteratorType1::value_type; + using value_type_2 = typename IteratorType2::value_type; + using predicate_t = + Impl::StdAlgoLessThanBinaryPredicate; + return lexicographical_compare_exespace_impl(label, ex, first1, last1, first2, + last2, predicate_t()); +} + +// +// team impl +// +template +KOKKOS_FUNCTION bool lexicographical_compare_team_impl( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2, ComparatorType comp) { + // checks + Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2); + Impl::static_assert_iterators_have_matching_difference_type(first1, first2); + Impl::expect_valid_range(first1, last1); + Impl::expect_valid_range(first2, last2); + + // aliases + using index_type = typename IteratorType1::difference_type; + using reducer_type = FirstLoc; + using reduction_value_type = typename reducer_type::value_type; + + // run + const auto d1 = Kokkos::Experimental::distance(first1, last1); + const auto d2 = Kokkos::Experimental::distance(first2, last2); + const auto range = Kokkos::min(d1, d2); + reduction_value_type red_result; + reducer_type reducer(red_result); + using func1_t = + StdLexicographicalCompareFunctor; + + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, range), + func1_t(first1, first2, reducer, comp), reducer); + + teamHandle.team_barrier(); + + // no mismatch + if (red_result.min_loc_true == + ::Kokkos::reduction_identity::min()) { + auto new_last1 = first1 + range; + auto new_last2 = first2 + range; + bool is_prefix = (new_last1 == last1) && (new_last2 != last2); + return is_prefix; + } + + // check mismatched + int less = 0; + auto it1 = first1 + red_result.min_loc_true; + auto it2 = first2 + red_result.min_loc_true; + using func2_t = StdCompareFunctor; + ::Kokkos::parallel_reduce(TeamThreadRange(teamHandle, 0, 1), + func2_t(it1, it2, comp), less); + + teamHandle.team_barrier(); + + return static_cast(less); +} + +template +KOKKOS_FUNCTION bool lexicographical_compare_team_impl( + const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, + IteratorType2 first2, IteratorType2 last2) { using value_type_1 = typename IteratorType1::value_type; using value_type_2 = typename IteratorType2::value_type; using predicate_t = Impl::StdAlgoLessThanBinaryPredicate; - return lexicographical_compare_impl(label, ex, first1, last1, first2, last2, - predicate_t()); + return lexicographical_compare_team_impl(teamHandle, first1, last1, first2, + last2, predicate_t()); } } // namespace Impl diff --git a/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp b/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp index 048420f7a8..2f51db03b4 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_MinMaxMinmaxElement.hpp @@ -63,12 +63,16 @@ struct StdMinMaxElemFunctor { : m_first(std::move(first)), m_reducer(std::move(reducer)) {} }; +// +// exespace impl +// template