diff --git a/README.md b/README.md index 8c9675169..2e77664f3 100644 --- a/README.md +++ b/README.md @@ -178,11 +178,11 @@ reference specification. All operations take as their first argument a reference to the SB_Handle, a `blas::SB_Handle` created with a `sycl::queue`. The last argument for all operators -is a vector of dependencies of type `cl::sycl::event` (empty by default). The return value +is a vector of dependencies of type `sycl::event` (empty by default). The return value is usually an array of SYCL events (except for some operations that can return a scalar or a tuple). The containers for the vectors and matrices (and scalars written by the BLAS operations) can either be `raw usm pointers` or `iterator buffers` that can be -created with a call to `cl::sycl::malloc_device` or `make_sycl_iterator_buffer` respectively. +created with a call to `sycl::malloc_device` or `make_sycl_iterator_buffer` respectively. The USM support in portBLAS is limited to `device allocated` memory only and we don't support `shared` or `host` allocations with USM. diff --git a/benchmark/cublas/blas1/asum.cpp b/benchmark/cublas/blas1/asum.cpp index 3766fb2cf..398d14b22 100644 --- a/benchmark/cublas/blas1/asum.cpp +++ b/benchmark/cublas/blas1/asum.cpp @@ -52,7 +52,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t size, // Create data std::vector v1 = blas_benchmark::utils::random_data(size); - // We need to guarantee that cl::sycl::half can hold the sum + // We need to guarantee that sycl::half can hold the sum // of x_v without overflow by making sum(x_v) to be 1.0 std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); diff --git a/benchmark/cublas/blas1/dot.cpp b/benchmark/cublas/blas1/dot.cpp index 75ae06719..2bfb33177 100644 --- a/benchmark/cublas/blas1/dot.cpp +++ b/benchmark/cublas/blas1/dot.cpp @@ -54,7 +54,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t size, std::vector v1 = blas_benchmark::utils::random_data(size); std::vector v2 = blas_benchmark::utils::random_data(size); - // Make sure cl::sycl::half can hold the result of the dot product + // Make sure sycl::half can hold the result of the dot product std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); diff --git a/benchmark/cublas/blas1/nrm2.cpp b/benchmark/cublas/blas1/nrm2.cpp index edcae7e88..4922387a8 100644 --- a/benchmark/cublas/blas1/nrm2.cpp +++ b/benchmark/cublas/blas1/nrm2.cpp @@ -54,7 +54,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t size, // Create data std::vector v1 = blas_benchmark::utils::random_data(size); - // We need to guarantee that cl::sycl::half can hold the norm of the vector + // We need to guarantee that sycl::half can hold the norm of the vector std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); diff --git a/benchmark/cublas/blas3/gemm.cpp b/benchmark/cublas/blas3/gemm.cpp index 61f044c03..6190c6706 100644 --- a/benchmark/cublas/blas3/gemm.cpp +++ b/benchmark/cublas/blas3/gemm.cpp @@ -34,7 +34,7 @@ static inline void cublas_routine(args_t&&... args) { CUBLAS_CHECK(cublasSgemm(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemm(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasHgemm(std::forward(args)...)); } return; diff --git a/benchmark/cublas/blas3/gemm_batched.cpp b/benchmark/cublas/blas3/gemm_batched.cpp index d1a4e3ae2..dc85dd6ad 100644 --- a/benchmark/cublas/blas3/gemm_batched.cpp +++ b/benchmark/cublas/blas3/gemm_batched.cpp @@ -34,7 +34,7 @@ static inline void cublas_routine(args_t&&... args) { CUBLAS_CHECK(cublasSgemmBatched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemmBatched(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasHgemmBatched(std::forward(args)...)); } return; diff --git a/benchmark/cublas/blas3/gemm_batched_strided.cpp b/benchmark/cublas/blas3/gemm_batched_strided.cpp index 846fd7806..046220a1e 100644 --- a/benchmark/cublas/blas3/gemm_batched_strided.cpp +++ b/benchmark/cublas/blas3/gemm_batched_strided.cpp @@ -34,7 +34,7 @@ static inline void cublas_routine(args_t&&... args) { CUBLAS_CHECK(cublasSgemmStridedBatched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasDgemmStridedBatched(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CUBLAS_CHECK(cublasHgemmStridedBatched(std::forward(args)...)); } return; diff --git a/benchmark/cublas/utils.hpp b/benchmark/cublas/utils.hpp index c658caff4..39845a741 100644 --- a/benchmark/cublas/utils.hpp +++ b/benchmark/cublas/utils.hpp @@ -287,7 +287,7 @@ struct CudaType { // When T is sycl::half, use cuda's __cuda as type. template -struct CudaType>> { +struct CudaType>> { using type = __half; }; diff --git a/benchmark/portblas/blas1/asum.cpp b/benchmark/portblas/blas1/asum.cpp index bdcfbd819..766a17830 100644 --- a/benchmark/portblas/blas1/asum.cpp +++ b/benchmark/portblas/blas1/asum.cpp @@ -45,7 +45,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, // Create data std::vector v1 = blas_benchmark::utils::random_data(size); - // We need to guarantee that cl::sycl::half can hold the sum + // We need to guarantee that sycl::half can hold the sum // of x_v without overflow by making sum(x_v) to be 1.0 std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); @@ -82,7 +82,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _asum(sb_handle, size, inx, static_cast(1), inr); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/axpy.cpp b/benchmark/portblas/blas1/axpy.cpp index aee6cca52..04374fe15 100644 --- a/benchmark/portblas/blas1/axpy.cpp +++ b/benchmark/portblas/blas1/axpy.cpp @@ -42,8 +42,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { state.SkipWithError("Unsupported fp16 (half) on this device."); } @@ -90,7 +90,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _axpy(sb_handle, size, alpha, inx, static_cast(1), iny, static_cast(1)); sb_handle.wait(event); diff --git a/benchmark/portblas/blas1/copy.cpp b/benchmark/portblas/blas1/copy.cpp index 0cf5c81c9..ea953d00a 100644 --- a/benchmark/portblas/blas1/copy.cpp +++ b/benchmark/portblas/blas1/copy.cpp @@ -88,7 +88,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_copy(sb_handle, size, x_gpu, incx, y_gpu, incy); diff --git a/benchmark/portblas/blas1/dot.cpp b/benchmark/portblas/blas1/dot.cpp index 4d45dc577..fc615be61 100644 --- a/benchmark/portblas/blas1/dot.cpp +++ b/benchmark/portblas/blas1/dot.cpp @@ -46,7 +46,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::vector v1 = blas_benchmark::utils::random_data(size); std::vector v2 = blas_benchmark::utils::random_data(size); - // Make sure cl::sycl::half can hold the result of the dot product + // Make sure sycl::half can hold the result of the dot product std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); @@ -85,7 +85,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _dot(sb_handle, size, inx, static_cast(1), iny, static_cast(1), inr); sb_handle.wait(event); diff --git a/benchmark/portblas/blas1/iamax.cpp b/benchmark/portblas/blas1/iamax.cpp index 053055874..86bbbb37c 100644 --- a/benchmark/portblas/blas1/iamax.cpp +++ b/benchmark/portblas/blas1/iamax.cpp @@ -84,7 +84,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _iamax(sb_handle, size, inx, static_cast(1), outI); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/iamin.cpp b/benchmark/portblas/blas1/iamin.cpp index d586d4353..d59883812 100644 --- a/benchmark/portblas/blas1/iamin.cpp +++ b/benchmark/portblas/blas1/iamin.cpp @@ -83,7 +83,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _iamin(sb_handle, size, inx, static_cast(1), outI); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/nrm2.cpp b/benchmark/portblas/blas1/nrm2.cpp index 057bb5b74..509485381 100644 --- a/benchmark/portblas/blas1/nrm2.cpp +++ b/benchmark/portblas/blas1/nrm2.cpp @@ -45,7 +45,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, // Create data std::vector v1 = blas_benchmark::utils::random_data(size); - // We need to guarantee that cl::sycl::half can hold the norm of the vector + // We need to guarantee that sycl::half can hold the norm of the vector std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); @@ -81,7 +81,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _nrm2(sb_handle, size, inx, static_cast(1), inr); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/rotg.cpp b/benchmark/portblas/blas1/rotg.cpp index f334613ff..0db4b2791 100644 --- a/benchmark/portblas/blas1/rotg.cpp +++ b/benchmark/portblas/blas1/rotg.cpp @@ -124,7 +124,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, #endif // Create a utility lambda describing the blas method that we want to run. - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _rotg(sb_handle, buf_a, buf_b, buf_c, buf_s); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/rotm.cpp b/benchmark/portblas/blas1/rotm.cpp index 8266f329f..4d3adbac0 100644 --- a/benchmark/portblas/blas1/rotm.cpp +++ b/benchmark/portblas/blas1/rotm.cpp @@ -116,7 +116,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _rotm(sb_handle, size, gpu_x_v, static_cast(1), gpu_y_v, static_cast(1), gpu_param); sb_handle.wait(event); diff --git a/benchmark/portblas/blas1/rotmg.cpp b/benchmark/portblas/blas1/rotmg.cpp index 2b4e81a07..31dd38dc7 100644 --- a/benchmark/portblas/blas1/rotmg.cpp +++ b/benchmark/portblas/blas1/rotmg.cpp @@ -135,7 +135,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, #endif // Create a utility lambda describing the blas method that we want to run. - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _rotmg(sb_handle, buf_d1, buf_d2, buf_x1, buf_y1, buf_param); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/scal.cpp b/benchmark/portblas/blas1/scal.cpp index e9c8b6646..b523a0860 100644 --- a/benchmark/portblas/blas1/scal.cpp +++ b/benchmark/portblas/blas1/scal.cpp @@ -42,8 +42,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { state.SkipWithError("Unsupported fp16 (half) on this device."); } @@ -84,7 +84,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _scal(sb_handle, size, alpha, in, static_cast(1)); sb_handle.wait(event); return event; diff --git a/benchmark/portblas/blas1/sdsdot.cpp b/benchmark/portblas/blas1/sdsdot.cpp index 36e963bf6..2420928d9 100644 --- a/benchmark/portblas/blas1/sdsdot.cpp +++ b/benchmark/portblas/blas1/sdsdot.cpp @@ -84,7 +84,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _sdsdot(sb_handle, size, sb, inx, static_cast(1), iny, static_cast(1), inr); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/gbmv.cpp b/benchmark/portblas/blas2/gbmv.cpp index e4a912aed..8c530d882 100644 --- a/benchmark/portblas/blas2/gbmv.cpp +++ b/benchmark/portblas/blas2/gbmv.cpp @@ -104,7 +104,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gbmv(sb_handle, *t_str, m, n, kl, ku, alpha, m_a_gpu, lda, v_x_gpu, incX, beta, v_y_gpu, incY); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/gemv.cpp b/benchmark/portblas/blas2/gemv.cpp index a822b9c3f..d4f52e13a 100644 --- a/benchmark/portblas/blas2/gemv.cpp +++ b/benchmark/portblas/blas2/gemv.cpp @@ -104,7 +104,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemv(sb_handle, *t_str, m, n, alpha, m_a_gpu, m, v_x_gpu, incX, beta, v_y_gpu, incY); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/ger.cpp b/benchmark/portblas/blas2/ger.cpp index 0b4b853ad..7f1ecf28e 100644 --- a/benchmark/portblas/blas2/ger.cpp +++ b/benchmark/portblas/blas2/ger.cpp @@ -102,7 +102,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _ger(sb_handle, m, n, alpha, v_x_gpu, incX, v_y_gpu, incY, m_a_gpu, lda); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/sbmv.cpp b/benchmark/portblas/blas2/sbmv.cpp index c10629b66..a4c056729 100644 --- a/benchmark/portblas/blas2/sbmv.cpp +++ b/benchmark/portblas/blas2/sbmv.cpp @@ -103,7 +103,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _sbmv(sb_handle, *uplo_str, n, k, alpha, m_a_gpu, lda, v_x_gpu, incX, beta, v_y_gpu, incY); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/spmv.cpp b/benchmark/portblas/blas2/spmv.cpp index 5b2925f0a..b2a68fd41 100644 --- a/benchmark/portblas/blas2/spmv.cpp +++ b/benchmark/portblas/blas2/spmv.cpp @@ -101,7 +101,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _spmv(sb_handle, *uplo_str, n, alpha, m_a_gpu, v_x_gpu, incX, beta, v_y_gpu, incY); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/spr.cpp b/benchmark/portblas/blas2/spr.cpp index cb9479bea..1006c0f34 100644 --- a/benchmark/portblas/blas2/spr.cpp +++ b/benchmark/portblas/blas2/spr.cpp @@ -92,7 +92,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_spr( sb_handle, uplo, size, alpha, v_x_gpu, incX, m_a_gpu); diff --git a/benchmark/portblas/blas2/spr2.cpp b/benchmark/portblas/blas2/spr2.cpp index d45d85d5e..065e3ec24 100644 --- a/benchmark/portblas/blas2/spr2.cpp +++ b/benchmark/portblas/blas2/spr2.cpp @@ -99,7 +99,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_spr2(sb_handle, uplo, n, alpha, v_x_gpu, incX, v_y_gpu, incY, m_a_gpu); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/symv.cpp b/benchmark/portblas/blas2/symv.cpp index c93241439..fa1659009 100644 --- a/benchmark/portblas/blas2/symv.cpp +++ b/benchmark/portblas/blas2/symv.cpp @@ -102,7 +102,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _symv(sb_handle, *uplo_str, n, alpha, m_a_gpu, lda, v_x_gpu, incX, beta, v_y_gpu, incY); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/syr.cpp b/benchmark/portblas/blas2/syr.cpp index 0f80905aa..d46b6df22 100644 --- a/benchmark/portblas/blas2/syr.cpp +++ b/benchmark/portblas/blas2/syr.cpp @@ -93,7 +93,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _syr(sb_handle, *uplo_str, n, alpha, v_x_gpu, incX, m_a_gpu, lda); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/syr2.cpp b/benchmark/portblas/blas2/syr2.cpp index 0ead8dc66..73a40ca51 100644 --- a/benchmark/portblas/blas2/syr2.cpp +++ b/benchmark/portblas/blas2/syr2.cpp @@ -103,7 +103,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _syr2(sb_handle, *uplo_str, n, alpha, v_x_gpu, incX, v_y_gpu, incY, m_a_gpu, lda); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/tbmv.cpp b/benchmark/portblas/blas2/tbmv.cpp index 82ea41f23..c6f869582 100644 --- a/benchmark/portblas/blas2/tbmv.cpp +++ b/benchmark/portblas/blas2/tbmv.cpp @@ -95,7 +95,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _tbmv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, lda, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/tbsv.cpp b/benchmark/portblas/blas2/tbsv.cpp index c516b3afd..d8319fda9 100644 --- a/benchmark/portblas/blas2/tbsv.cpp +++ b/benchmark/portblas/blas2/tbsv.cpp @@ -106,7 +106,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _tbsv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, lda, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/tpmv.cpp b/benchmark/portblas/blas2/tpmv.cpp index 621f91523..baf0290af 100644 --- a/benchmark/portblas/blas2/tpmv.cpp +++ b/benchmark/portblas/blas2/tpmv.cpp @@ -92,7 +92,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _tpmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/tpsv.cpp b/benchmark/portblas/blas2/tpsv.cpp index 0028cbf0a..fb0fb77e1 100644 --- a/benchmark/portblas/blas2/tpsv.cpp +++ b/benchmark/portblas/blas2/tpsv.cpp @@ -106,7 +106,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _tpsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/trmv.cpp b/benchmark/portblas/blas2/trmv.cpp index ac6d54818..4a7911f39 100644 --- a/benchmark/portblas/blas2/trmv.cpp +++ b/benchmark/portblas/blas2/trmv.cpp @@ -96,7 +96,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _trmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, lda, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas2/trsv.cpp b/benchmark/portblas/blas2/trsv.cpp index 85d383e07..2027091ed 100644 --- a/benchmark/portblas/blas2/trsv.cpp +++ b/benchmark/portblas/blas2/trsv.cpp @@ -104,7 +104,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _trsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, lda, v_x_gpu, incX); sb_handle.wait(event); diff --git a/benchmark/portblas/blas3/gemm.cpp b/benchmark/portblas/blas3/gemm.cpp index 12352bcb3..824d4cbc1 100644 --- a/benchmark/portblas/blas3/gemm.cpp +++ b/benchmark/portblas/blas3/gemm.cpp @@ -55,8 +55,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { state.SkipWithError("Unsupported fp16 (half) on this device."); } @@ -108,7 +108,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, b_gpu, ldb, beta, c_gpu, ldc); sb_handle.wait(event); @@ -280,7 +280,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, b_gpu, ldb, beta_sycl, c_gpu, ldc); sb_handle.wait(event); diff --git a/benchmark/portblas/blas3/gemm_batched.cpp b/benchmark/portblas/blas3/gemm_batched.cpp index 21a7c47a6..93da8aaed 100644 --- a/benchmark/portblas/blas3/gemm_batched.cpp +++ b/benchmark/portblas/blas3/gemm_batched.cpp @@ -90,8 +90,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { state.SkipWithError("Unsupported fp16 (half) on this device."); } @@ -173,7 +173,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm_batched(sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, b_gpu, ldb, beta, c_gpu, ldc, batch_size, batch_type); @@ -363,7 +363,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif // BLAS_VERIFY_BENCHMARK - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm_batched(sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, b_gpu, ldb, beta_sycl, c_gpu, ldc, batch_size, batch_type); diff --git a/benchmark/portblas/blas3/gemm_batched_strided.cpp b/benchmark/portblas/blas3/gemm_batched_strided.cpp index eb76e01f7..85e02ab0e 100644 --- a/benchmark/portblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/portblas/blas3/gemm_batched_strided.cpp @@ -60,8 +60,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, blas::SB_Handle& sb_handle = *sb_handle_ptr; auto q = sb_handle.get_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { state.SkipWithError("Unsupported fp16 (half) on this device."); } @@ -140,7 +140,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm_strided_batched( sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, stride_a, b_gpu, ldb, stride_b, beta, c_gpu, ldc, stride_c, batch_size); @@ -349,7 +349,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _gemm_strided_batched( sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, stride_a, b_gpu, ldb, stride_b, beta_sycl, c_gpu, ldc, stride_c, batch_size); diff --git a/benchmark/portblas/blas3/symm.cpp b/benchmark/portblas/blas3/symm.cpp index 2a482753f..ef47b0595 100644 --- a/benchmark/portblas/blas3/symm.cpp +++ b/benchmark/portblas/blas3/symm.cpp @@ -99,7 +99,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _symm(sb_handle, side, uplo, m, n, alpha, a_gpu, lda, b_gpu, ldb, beta, c_gpu, ldc); sb_handle.wait(event); diff --git a/benchmark/portblas/blas3/trsm.cpp b/benchmark/portblas/blas3/trsm.cpp index 8d28cec4c..e422d556a 100644 --- a/benchmark/portblas/blas3/trsm.cpp +++ b/benchmark/portblas/blas3/trsm.cpp @@ -110,7 +110,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _trsm(sb_handle, side, uplo, trans, diag, m, n, alpha, a_gpu, lda, b_gpu, ldb); sb_handle.wait(event); diff --git a/benchmark/portblas/extension/axpy_batch.cpp b/benchmark/portblas/extension/axpy_batch.cpp index 2dcbe3451..09890b7f7 100644 --- a/benchmark/portblas/extension/axpy_batch.cpp +++ b/benchmark/portblas/extension/axpy_batch.cpp @@ -97,7 +97,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, iny, inc_y, stride_y, batch_size); sb_handle.wait(event); diff --git a/benchmark/portblas/extension/omatadd.cpp b/benchmark/portblas/extension/omatadd.cpp index c319eb176..eb9bb064d 100644 --- a/benchmark/portblas/extension/omatadd.cpp +++ b/benchmark/portblas/extension/omatadd.cpp @@ -113,7 +113,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_omatadd(sb_handle, *t_str_a, *t_str_b, m, n, alpha, m_a_gpu, lda, beta, m_b_gpu, ldb, m_c_gpu, ldc); sb_handle.wait(event); diff --git a/benchmark/portblas/extension/omatadd_batched.cpp b/benchmark/portblas/extension/omatadd_batched.cpp index 17f9820ad..0d708c209 100644 --- a/benchmark/portblas/extension/omatadd_batched.cpp +++ b/benchmark/portblas/extension/omatadd_batched.cpp @@ -117,7 +117,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_omatadd_batch( sb_handle, *t_str_a, *t_str_b, m, n, alpha, m_a_gpu, lda, stride_a, beta, m_b_gpu, ldb, stride_b, m_c_gpu, ldc, stride_c, batch_size); diff --git a/benchmark/portblas/extension/omatcopy.cpp b/benchmark/portblas/extension/omatcopy.cpp index 17e990d90..88f3152ae 100644 --- a/benchmark/portblas/extension/omatcopy.cpp +++ b/benchmark/portblas/extension/omatcopy.cpp @@ -102,7 +102,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_omatcopy(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda, m_b_gpu, ldb); sb_handle.wait(event); diff --git a/benchmark/portblas/extension/omatcopy2.cpp b/benchmark/portblas/extension/omatcopy2.cpp index 121eef9f7..db6111807 100644 --- a/benchmark/portblas/extension/omatcopy2.cpp +++ b/benchmark/portblas/extension/omatcopy2.cpp @@ -103,7 +103,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_omatcopy2(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda, inc_a, m_b_gpu, ldb, inc_b); sb_handle.wait(event); diff --git a/benchmark/portblas/extension/omatcopy_batched.cpp b/benchmark/portblas/extension/omatcopy_batched.cpp index 926bff960..1385ac71e 100644 --- a/benchmark/portblas/extension/omatcopy_batched.cpp +++ b/benchmark/portblas/extension/omatcopy_batched.cpp @@ -105,7 +105,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda, stride_a, m_b_gpu, ldb, stride_b, batch_size); diff --git a/benchmark/portblas/extension/reduction.cpp b/benchmark/portblas/extension/reduction.cpp index 906cc56f1..445a1d713 100644 --- a/benchmark/portblas/extension/reduction.cpp +++ b/benchmark/portblas/extension/reduction.cpp @@ -113,7 +113,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t rows, }; #endif - auto blas_method_def = [&]() -> std::vector { + auto blas_method_def = [&]() -> std::vector { auto event = extension::_reduction( sb_handle, mat_buffer, rows, vec_buffer, rows, cols, dim); sb_handle.wait(event); diff --git a/benchmark/portblas/main.cpp b/benchmark/portblas/main.cpp index 46ed313f1..c25c1781b 100644 --- a/benchmark/portblas/main.cpp +++ b/benchmark/portblas/main.cpp @@ -41,7 +41,7 @@ int main(int argc, char** argv) { // Initialize googlebench benchmark::Initialize(&argc, argv); - cl::sycl::queue q; + sycl::queue q; if (!args.device.empty()) { // Initialise the command line device selector in a unique pointer so that @@ -57,11 +57,10 @@ int main(int argc, char** argv) { // Create a queue from the device selector - do this after initialising // googlebench, as otherwise we may not be able to delete the queue before // we exit (if Initialise calls exit(0)), and dump some information about it - q = cl::sycl::queue(*cdsp.get(), - {cl::sycl::property::queue::enable_profiling()}); + q = sycl::queue(*cdsp.get(), {sycl::property::queue::enable_profiling()}); } else { - q = cl::sycl::queue(cl::sycl::default_selector(), - {cl::sycl::property::queue::enable_profiling()}); + q = sycl::queue(sycl::default_selector_v, + {sycl::property::queue::enable_profiling()}); } utils::print_queue_information(q); diff --git a/benchmark/portblas/utils.hpp b/benchmark/portblas/utils.hpp index 97f84409c..659f9384f 100644 --- a/benchmark/portblas/utils.hpp +++ b/benchmark/portblas/utils.hpp @@ -26,8 +26,8 @@ #ifndef SYCL_UTILS_HPP #define SYCL_UTILS_HPP -#include #include +#include #include #include "portblas.h" @@ -46,17 +46,17 @@ namespace utils { /** * @fn time_event - * @brief Get the overall run time (start -> end) of a cl::sycl::event enqueued + * @brief Get the overall run time (start -> end) of a sycl::event enqueued * on a queue with profiling. */ template <> -inline double time_event(cl::sycl::event& e) { +inline double time_event(sycl::event& e) { // get start and end times auto start_time = e.template get_profiling_info< - cl::sycl::info::event_profiling::command_start>(); + sycl::info::event_profiling::command_start>(); - auto end_time = e.template get_profiling_info< - cl::sycl::info::event_profiling::command_end>(); + auto end_time = + e.template get_profiling_info(); // return the delta return static_cast(end_time - start_time); diff --git a/benchmark/rocblas/blas3/gemm.cpp b/benchmark/rocblas/blas3/gemm.cpp index 8868f44f7..ffe117294 100644 --- a/benchmark/rocblas/blas3/gemm.cpp +++ b/benchmark/rocblas/blas3/gemm.cpp @@ -34,7 +34,7 @@ static inline void rocblas_gemm_f(args_t&&... args) { CHECK_ROCBLAS_STATUS(rocblas_sgemm(std::forward(args)...)); } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_dgemm(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_hgemm(std::forward(args)...)); } return; diff --git a/benchmark/rocblas/blas3/gemm_batched.cpp b/benchmark/rocblas/blas3/gemm_batched.cpp index f73c176cf..35b43a164 100644 --- a/benchmark/rocblas/blas3/gemm_batched.cpp +++ b/benchmark/rocblas/blas3/gemm_batched.cpp @@ -34,7 +34,7 @@ static inline void rocblas_gemm_batched_f(args_t&&... args) { CHECK_ROCBLAS_STATUS(rocblas_sgemm_batched(std::forward(args)...)); } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_dgemm_batched(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS(rocblas_hgemm_batched(std::forward(args)...)); } return; diff --git a/benchmark/rocblas/blas3/gemm_batched_strided.cpp b/benchmark/rocblas/blas3/gemm_batched_strided.cpp index 61526b246..df270c642 100644 --- a/benchmark/rocblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/rocblas/blas3/gemm_batched_strided.cpp @@ -36,7 +36,7 @@ static inline void rocblas_gemm_strided_batched(args_t&&... args) { } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS( rocblas_dgemm_strided_batched(std::forward(args)...)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { CHECK_ROCBLAS_STATUS( rocblas_hgemm_strided_batched(std::forward(args)...)); } diff --git a/benchmark/rocblas/utils.hpp b/benchmark/rocblas/utils.hpp index e5637e1bc..494f782ae 100644 --- a/benchmark/rocblas/utils.hpp +++ b/benchmark/rocblas/utils.hpp @@ -385,7 +385,7 @@ struct RocblasType { // When T is sycl::half, use rocBLAS's rocblas_half as type. template -struct RocblasType>> { +struct RocblasType>> { using type = rocblas_half; }; } // namespace utils diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index 4a02655a3..678442e53 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -34,13 +34,13 @@ set(data_list "${BLAS_DATA_TYPES}") # Converts a user specified type name into a C++ type function(cpp_type output data) if (${data} STREQUAL "half") - set(${output} "cl::sycl::half" PARENT_SCOPE) + set(${output} "sycl::half" PARENT_SCOPE) return() elseif(${data} STREQUAL "complex") - set(${output} "cl::sycl::ext::oneapi::experimental::complex" PARENT_SCOPE) + set(${output} "sycl::ext::oneapi::experimental::complex" PARENT_SCOPE) return() elseif(${data} STREQUAL "complex") - set(${output} "cl::sycl::ext::oneapi::experimental::complex" PARENT_SCOPE) + set(${output} "sycl::ext::oneapi::experimental::complex" PARENT_SCOPE) return() endif() set(${output} "${data}" PARENT_SCOPE) @@ -613,13 +613,13 @@ elseif(${TUNING_TARGET} STREQUAL "NVIDIA_GPU") if(${start_idx} AND ${sm_val} GREATER_EQUAL "80") add_gemm_configuration( "float" 128 "false" "true" "true" - 128 2 4 16 8 16 2 1 1 1 1 16 16 16 cl::sycl::half float "local" "standard" "none" 1 "strided" "true") + 128 2 4 16 8 16 2 1 1 1 1 16 16 16 sycl::half float "local" "standard" "none" 1 "strided" "true") add_gemm_configuration( "float" 128 "false" "true" "true" - 128 4 8 16 8 16 2 1 1 1 1 16 16 16 cl::sycl::half float "local" "standard" "none" 1 "strided" "true") + 128 4 8 16 8 16 2 1 1 1 1 16 16 16 sycl::half float "local" "standard" "none" 1 "strided" "true") add_gemm_configuration( "float" 256 "false" "true" "true" - 128 8 8 16 16 16 2 1 1 1 1 16 16 16 cl::sycl::half float "local" "standard" "none" 1 "strided" "true") + 128 8 8 16 16 16 2 1 1 1 1 16 16 16 sycl::half float "local" "standard" "none" 1 "strided" "true") endif() foreach(data ${supported_types}) # Non-Joint Matrix specific GEMM Configurations diff --git a/common/include/common/cli_device_selector.hpp b/common/include/common/cli_device_selector.hpp index 1eff0a709..66e2cc92f 100644 --- a/common/include/common/cli_device_selector.hpp +++ b/common/include/common/cli_device_selector.hpp @@ -29,10 +29,10 @@ #ifndef CLI_DEVICE_SELECTOR_HPP #define CLI_DEVICE_SELECTOR_HPP -#include #include #include #include +#include #include "extract_vendor_type.hpp" @@ -43,26 +43,25 @@ namespace utils { * the available devices according to whether they match the vendor/device type, * and picks the one with highest score. */ -class cli_device_selector : public cl::sycl::device_selector { +class cli_device_selector { std::string device_vendor; std::string device_type; - static cl::sycl::info::device_type match_device_type(std::string requested) { - if (requested.empty()) return cl::sycl::info::device_type::automatic; + static sycl::info::device_type match_device_type(std::string requested) { + if (requested.empty()) return sycl::info::device_type::automatic; std::transform(requested.begin(), requested.end(), requested.begin(), ::tolower); - if (requested == "gpu") return cl::sycl::info::device_type::gpu; - if (requested == "cpu") return cl::sycl::info::device_type::cpu; - if (requested == "accel") return cl::sycl::info::device_type::accelerator; + if (requested == "gpu") return sycl::info::device_type::gpu; + if (requested == "cpu") return sycl::info::device_type::cpu; + if (requested == "accel") return sycl::info::device_type::accelerator; if (requested == "*" || requested == "any") - return cl::sycl::info::device_type::all; + return sycl::info::device_type::all; - return cl::sycl::info::device_type::automatic; + return sycl::info::device_type::automatic; } public: - cli_device_selector(const std::string& device_spec) - : cl::sycl::device_selector() { + cli_device_selector(const std::string& device_spec) { if (!device_spec.empty()) { bool result; std::tie(result, device_vendor, device_type) = @@ -70,24 +69,24 @@ class cli_device_selector : public cl::sycl::device_selector { } } - int operator()(const cl::sycl::device& device) const { + int operator()(const sycl::device& device) const { int score = 0; // Score the device type... - cl::sycl::info::device_type dtype = - device.get_info(); - cl::sycl::info::device_type rtype = match_device_type(device_type); - if (rtype == dtype || rtype == cl::sycl::info::device_type::all) { + sycl::info::device_type dtype = + device.get_info(); + sycl::info::device_type rtype = match_device_type(device_type); + if (rtype == dtype || rtype == sycl::info::device_type::all) { score += 2; - } else if (rtype == cl::sycl::info::device_type::automatic) { + } else if (rtype == sycl::info::device_type::automatic) { score += 1; } else { score -= 2; } // score the vendor name - cl::sycl::platform plat = device.get_platform(); - std::string name = plat.template get_info(); + sycl::platform plat = device.get_platform(); + std::string name = plat.template get_info(); std::transform(name.begin(), name.end(), name.begin(), ::tolower); if (name.find(device_vendor) != std::string::npos && !device_vendor.empty()) { diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index cc9496581..06e01d570 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -1627,7 +1627,7 @@ inline std::string get_type_name() { } template <> -inline std::string get_type_name() { +inline std::string get_type_name() { return "half"; } @@ -1662,8 +1662,7 @@ static inline scalar_t random_scalar(scalar_t rangeMin, scalar_t rangeMax) { static std::random_device rd; static std::default_random_engine gen(rd()); using random_scalar_t = - std::conditional_t, float, - scalar_t>; + std::conditional_t, float, scalar_t>; std::uniform_real_distribution dis(rangeMin, rangeMax); return dis(gen); } @@ -1938,11 +1937,11 @@ static inline void calc_avg_counters(benchmark::State& state) { #endif // BLAS_DATA_TYPE_DOUBLE #ifdef BLAS_ENABLE_HALF -/** Registers benchmark for the cl::sycl::half data type +/** Registers benchmark for the sycl::half data type * @see BLAS_REGISTER_BENCHMARK */ #define BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success) \ - register_benchmark(args, sb_handle_ptr, success) + register_benchmark(args, sb_handle_ptr, success) #else #define BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success) #endif // BLAS_ENABLE_HALF diff --git a/common/include/common/float_comparison.hpp b/common/include/common/float_comparison.hpp index 5c634279e..f55118b55 100644 --- a/common/include/common/float_comparison.hpp +++ b/common/include/common/float_comparison.hpp @@ -32,23 +32,6 @@ #include #endif -#if SYCL_LANGUAGE_VERSION < 202000 -#include -inline std::ostream& operator<<(std::ostream& os, const cl::sycl::half& value) { - os << static_cast(value); - return os; -} - -namespace std { -template <> -class numeric_limits { - public: - static constexpr float min() { return -65504.0f; } - static constexpr float max() { return 65504.0f; } -}; -} // namespace std -#endif // SYCL_LANGUAGE_VERSION - namespace utils { template @@ -84,17 +67,17 @@ scalar_t abs(std::complex value) noexcept { #endif template <> -inline bool isnan(cl::sycl::half value) noexcept { +inline bool isnan(sycl::half value) noexcept { return std::isnan(static_cast(value)); } template <> -inline bool isinf(cl::sycl::half value) noexcept { +inline bool isinf(sycl::half value) noexcept { return std::isinf(static_cast(value)); } template <> -inline cl::sycl::half abs(cl::sycl::half value) noexcept { +inline sycl::half abs(sycl::half value) noexcept { return std::abs(static_cast(value)); } @@ -138,7 +121,7 @@ inline double getRelativeErrorMargin(const int32_t) { } template <> -inline cl::sycl::half getRelativeErrorMargin(const int32_t) { +inline sycl::half getRelativeErrorMargin(const int32_t) { // Measured empirically with gemm return 0.05f; } @@ -169,7 +152,7 @@ inline double getAbsoluteErrorMargin(const int32_t) { } template <> -inline cl::sycl::half getAbsoluteErrorMargin(const int32_t) { +inline sycl::half getAbsoluteErrorMargin(const int32_t) { // Measured empirically with gemm. return 1.0f; } @@ -209,7 +192,7 @@ inline bool almost_equal(scalar_t const& scalar1, scalar_t const& scalar2, * The second vector is considered the reference. * @tparam scalar_t the type of data present in the input vectors * @tparam epsilon_t the type used as tolerance. Lower precision types - * (cl::sycl::half) will have a higher tolerance for errors + * (sycl::half) will have a higher tolerance for errors */ template inline bool compare_vectors(std::vector const& vec, @@ -270,7 +253,7 @@ inline bool compare_vectors(std::vector> const& vec, * the reference. * @tparam scalar_t the type of data present in the input vectors * @tparam epsilon_t the type used as tolerance. Lower precision types - * (cl::sycl::half) will have a higher tolerance for errors + * (sycl::half) will have a higher tolerance for errors * @param stride is the stride between two consecutive 'windows' * @param window is the size of a comparison window */ diff --git a/common/include/common/print_queue_information.hpp b/common/include/common/print_queue_information.hpp index 8e5269188..1b8019526 100644 --- a/common/include/common/print_queue_information.hpp +++ b/common/include/common/print_queue_information.hpp @@ -29,43 +29,41 @@ #ifndef PRINT_QUEUE_INFORMATION_HPP #define PRINT_QUEUE_INFORMATION_HPP -#include #include #include #include +#include namespace utils { -inline void print_queue_information(cl::sycl::queue q) { - std::cerr - << "Device vendor: " - << q.get_device().template get_info() - << std::endl; +inline void print_queue_information(sycl::queue q) { + std::cerr << "Device vendor: " + << q.get_device().template get_info() + << std::endl; std::cerr << "Device name: " - << q.get_device().template get_info() + << q.get_device().template get_info() << std::endl; std::cerr << "Device type: "; - switch ( - q.get_device().template get_info()) { - case cl::sycl::info::device_type::cpu: + switch (q.get_device().template get_info()) { + case sycl::info::device_type::cpu: std::cerr << "cpu"; break; - case cl::sycl::info::device_type::gpu: + case sycl::info::device_type::gpu: std::cerr << "gpu"; break; - case cl::sycl::info::device_type::accelerator: + case sycl::info::device_type::accelerator: std::cerr << "accelerator"; break; - case cl::sycl::info::device_type::custom: + case sycl::info::device_type::custom: std::cerr << "custom"; break; - case cl::sycl::info::device_type::automatic: + case sycl::info::device_type::automatic: std::cerr << "automatic"; break; - case cl::sycl::info::device_type::host: + case sycl::info::device_type::host: std::cerr << "host"; break; - case cl::sycl::info::device_type::all: + case sycl::info::device_type::all: std::cerr << "all"; break; default: diff --git a/common/include/common/set_benchmark_label.hpp b/common/include/common/set_benchmark_label.hpp index b707276bb..0c2010df8 100644 --- a/common/include/common/set_benchmark_label.hpp +++ b/common/include/common/set_benchmark_label.hpp @@ -53,7 +53,7 @@ namespace device_info { * \param [in] device SYCL device to query for info to add to the label. * \param [out] key_value_map The benchmark key value pair to hold the info. */ -inline void add_device_info(cl::sycl::device const& device, +inline void add_device_info(sycl::device const& device, std::map& key_value_map) { // OpenCL is unclear whether strings returned from clGet*Info() should be // null terminated. @@ -63,11 +63,10 @@ inline void add_device_info(cl::sycl::device const& device, s.resize(strlen(s.c_str())); return s; }; - auto device_name = device.get_info(); - auto device_version = device.get_info(); - auto vendor_name = device.get_info(); - auto driver_version = - device.get_info(); + auto device_name = device.get_info(); + auto device_version = device.get_info(); + auto vendor_name = device.get_info(); + auto driver_version = device.get_info(); key_value_map["device_name"] = trim(device_name); key_value_map["device_version"] = trim(device_version); @@ -149,7 +148,7 @@ inline void add_datatype_info( #ifdef BLAS_ENABLE_HALF template <> -inline void add_datatype_info( +inline void add_datatype_info( std::map& key_value_map) { key_value_map["@datatype"] = "half"; } @@ -197,8 +196,7 @@ inline void add_common_labels( } // namespace internal template -inline void set_benchmark_label(benchmark::State& state, - const cl::sycl::queue& q) { +inline void set_benchmark_label(benchmark::State& state, const sycl::queue& q) { std::map key_value_map; auto dev = q.get_device(); device_info::add_device_info(dev, key_value_map); diff --git a/common/include/common/system_reference_blas.hpp b/common/include/common/system_reference_blas.hpp index dc98f120e..e24f47c7a 100644 --- a/common/include/common/system_reference_blas.hpp +++ b/common/include/common/system_reference_blas.hpp @@ -151,7 +151,7 @@ scalar_t asum(const int n, scalar_t x[], const int incX) { template void axpy(const int n, scalar_t alpha, const scalar_t x[], const int incX, scalar_t y[], const int incY) { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto func = blas_system_function(&cblas_saxpy, &cblas_daxpy); func(n, alpha, x, incX, y, incY); } else { @@ -165,7 +165,7 @@ void axpy(const int n, scalar_t alpha, const scalar_t x[], const int incX, for (int i = 0; i < y_size; ++i) y_f[i] = static_cast(y[i]); cblas_saxpy(n, alpha_f, x_f.data(), incX, y_f.data(), incY); - for (int i = 0; i < y_size; ++i) y[i] = static_cast(y_f[i]); + for (int i = 0; i < y_size; ++i) y[i] = static_cast(y_f[i]); } } @@ -238,7 +238,7 @@ void rotmg(scalar_t *d1, scalar_t *d2, scalar_t *x1, scalar_t *y1, template void scal(const int n, const scalar_t alpha, scalar_t x[], const int incX) { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto func = blas_system_function(&cblas_sscal, &cblas_dscal); func(n, alpha, x, incX); } else { @@ -248,7 +248,7 @@ void scal(const int n, const scalar_t alpha, scalar_t x[], const int incX) { std::vector x_f(size); for (int i = 0; i < size; ++i) x_f[i] = static_cast(x[i]); cblas_sscal(n, alpha_f, x_f.data(), incX); - for (int i = 0; i < size; ++i) x[i] = static_cast(x_f[i]); + for (int i = 0; i < size; ++i) x[i] = static_cast(x_f[i]); } } @@ -403,7 +403,7 @@ template void gemm(const char *transA, const char *transB, int m, int n, int k, scalar_t alpha, const scalar_t a[], int lda, const scalar_t b[], int ldb, scalar_t beta, scalar_t c[], int ldc) { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { auto func = blas_system_function(&cblas_sgemm, &cblas_dgemm); func(CblasColMajor, c_trans(*transA), c_trans(*transB), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -426,7 +426,7 @@ void gemm(const char *transA, const char *transB, int m, int n, int k, cblas_sgemm(CblasColMajor, c_trans(*transA), c_trans(*transB), m, n, k, alpha_f, a_f.data(), lda, b_f.data(), ldb, beta_f, c_f.data(), ldc); - for (int i = 0; i < c_size; ++i) c[i] = static_cast(c_f[i]); + for (int i = 0; i < c_size; ++i) c[i] = static_cast(c_f[i]); } } diff --git a/doc/AddingBlas3Op.md b/doc/AddingBlas3Op.md index 147817025..5367c3e16 100644 --- a/doc/AddingBlas3Op.md +++ b/doc/AddingBlas3Op.md @@ -281,12 +281,12 @@ struct DiagonalBlocksInverter { index_t N_; DiagonalBlocksInverter(matrix_t& A, matrix_t& invA); - bool valid_thread(cl::sycl::nd_item<1> id) const; - void bind(cl::sycl::handler& cgh); + bool valid_thread(sycl::nd_item<1> id) const; + void bind(sycl::handler& cgh); void adjust_access_displacement(); template - void eval(local_memory_t localMem, cl::sycl::nd_item<1> id) noexcept; + void eval(local_memory_t localMem, sycl::nd_item<1> id) noexcept; }; template diff --git a/include/blas_meta.h b/include/blas_meta.h index b80639128..28ceb8cf9 100644 --- a/include/blas_meta.h +++ b/include/blas_meta.h @@ -26,7 +26,7 @@ #ifndef PORTBLAS_META_H #define PORTBLAS_META_H -#include +#include #include #include #ifdef BLAS_ENABLE_COMPLEX @@ -191,7 +191,7 @@ struct is_sycl_scalar std::false_type>::type {}; template <> -struct is_sycl_scalar : std::true_type {}; +struct is_sycl_scalar : std::true_type {}; template <> struct is_sycl_scalar : std::false_type {}; @@ -201,12 +201,12 @@ struct is_sycl_scalar : std::false_type {}; template struct is_half - : std::integral_constant> {}; + : std::integral_constant> {}; #ifdef BLAS_ENABLE_COMPLEX // SYCL Complex type alias template -using complex_sycl = typename cl::sycl::ext::oneapi::experimental::complex; +using complex_sycl = typename sycl::ext::oneapi::experimental::complex; template struct is_complex_sycl diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h index e6ea4f953..2ac4b8ae3 100644 --- a/include/container/sycl_iterator.h +++ b/include/container/sycl_iterator.h @@ -25,7 +25,7 @@ #ifndef PORTBLAS_BUFFER_ITERATOR_H #define PORTBLAS_BUFFER_ITERATOR_H #include "blas_meta.h" -#include +#include namespace blas { /*! * @brief See BufferIterator. @@ -35,25 +35,21 @@ class BufferIterator { public: using scalar_t = element_t; template - using buffer_t = cl::sycl::buffer; - using access_mode_t = cl::sycl::access::mode; - template + using buffer_t = sycl::buffer; + using access_mode_t = sycl::access_mode; + template using accessor_t = - cl::sycl::accessor; - template + sycl::accessor; + template using placeholder_accessor_t = - cl::sycl::accessor; - template + sycl::accessor; + template using default_accessor_t = placeholder_accessor_t; using self_t = BufferIterator; using buff_t = buffer_t<1>; @@ -63,12 +59,11 @@ class BufferIterator { * @tparam acc_md_t memory access mode * @tparam scalar_t the element type of the buffer * @param buff_iterator BufferIterator - * @param cgh cl::sycl::handler + * @param cgh sycl::handler * @param size the region needed to be copied */ - template < - cl::sycl::access::mode acc_md_t = cl::sycl::access::mode::read_write> - inline accessor_t get_range_accessor(cl::sycl::handler& cgh, + template + inline accessor_t get_range_accessor(sycl::handler& cgh, size_t size); /*! @@ -77,12 +72,11 @@ class BufferIterator { * @tparam acc_md_t memory access mode * @tparam scalar_t the element type of the buffer * @param buff_iterator BufferIterator - * @param cgh cl::sycl::handler + * @param cgh sycl::handler * @param size the region needed to be copied */ - template < - cl::sycl::access::mode acc_md_t = cl::sycl::access::mode::read_write> + template inline placeholder_accessor_t get_range_accessor(size_t size); /*! @@ -91,12 +85,11 @@ class BufferIterator { * @tparam acc_md_t memory access mode * @tparam scalar_t the element type of the buffer * @param buff_iterator BufferIterator - * @param cgh cl::sycl::handler + * @param cgh sycl::handler * @param size the region needed to be copied */ - template < - cl::sycl::access::mode acc_md_t = cl::sycl::access::mode::read_write> - inline accessor_t get_range_accessor(cl::sycl::handler& cgh); + template + inline accessor_t get_range_accessor(sycl::handler& cgh); /*! * @brief create a range placeholder accessor from (offset, @@ -104,11 +97,10 @@ class BufferIterator { * @tparam acc_md_t memory access mode * @tparam scalar_t the element type of the buffer * @param buff_iterator BufferIterator - * @param cgh cl::sycl::handler + * @param cgh sycl::handler * @param size the region needed to be copied */ - template < - cl::sycl::access::mode acc_md_t = cl::sycl::access::mode::read_write> + template inline placeholder_accessor_t get_range_accessor(); /*! * @brief Default construct a BufferIterator. @@ -119,7 +111,7 @@ class BufferIterator { * constructible. See: * https://github.com/codeplaysoftware/standards-proposals/blob/master/default-constructed-buffers/default-constructed-buffers.md */ - BufferIterator() : offset_{0}, buffer_{cl::sycl::range<1>{1}} {} + BufferIterator() : offset_{0}, buffer_{sycl::range<1>{1}} {} /*! * @brief See BufferIterator. */ @@ -190,35 +182,34 @@ class BufferIterator { }; template -template +template inline typename BufferIterator::template accessor_t -BufferIterator::get_range_accessor(cl::sycl::handler& cgh, - size_t size) { +BufferIterator::get_range_accessor(sycl::handler& cgh, size_t size) { return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + buffer_, cgh, sycl::range<1>(size), + sycl::id<1>(BufferIterator::get_offset())); } template -template +template inline typename BufferIterator::template accessor_t -BufferIterator::get_range_accessor(cl::sycl::handler& cgh) { +BufferIterator::get_range_accessor(sycl::handler& cgh) { return BufferIterator::get_range_accessor( cgh, BufferIterator::get_size()); } template -template +template inline typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor(size_t size) { return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); + acc_md_t>(buffer_, sycl::range<1>(size), + sycl::id<1>(BufferIterator::get_offset())); } template -template +template inline typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor() { @@ -237,7 +228,7 @@ template inline blas::BufferIterator make_sycl_iterator_buffer(scalar_t* data, index_t size) { using buff_t = typename blas::BufferIterator::buff_t; - return blas::BufferIterator{buff_t{data, cl::sycl::range<1>(size)}}; + return blas::BufferIterator{buff_t{data, sycl::range<1>(size)}}; } /*! @@ -252,7 +243,7 @@ inline BufferIterator make_sycl_iterator_buffer( std::vector& data, index_t size) { using buff_t = typename blas::BufferIterator::buff_t; return blas::BufferIterator{ - buff_t{data.data(), cl::sycl::range<1>(size)}}; + buff_t{data.data(), sycl::range<1>(size)}}; } /*! @@ -265,7 +256,7 @@ inline BufferIterator make_sycl_iterator_buffer( template inline blas::BufferIterator make_sycl_iterator_buffer(index_t size) { using buff_t = typename blas::BufferIterator::buff_t; - return blas::BufferIterator{buff_t{cl::sycl::range<1>(size)}}; + return blas::BufferIterator{buff_t{sycl::range<1>(size)}}; } /*! * @brief Helper function to build BufferIterator @@ -296,7 +287,7 @@ template inline BufferIterator::BufferIterator( const BufferIterator& other) : BufferIterator(other.get_buffer().template reinterpret( - cl::sycl::range<1>(other.get_buffer().get_count())), + sycl::range<1>(other.get_buffer().size())), other.get_offset()) {} template @@ -341,7 +332,7 @@ inline BufferIterator BufferIterator::operator++(int i) { template inline std::ptrdiff_t BufferIterator::get_size() const { - return (buffer_.get_count() - offset_); + return (buffer_.size() - offset_); } template diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h index d005e1915..cb79f554c 100644 --- a/include/operations/blas1_trees.h +++ b/include/operations/blas1_trees.h @@ -27,8 +27,8 @@ #define PORTBLAS_BLAS1_TREES_H #include "operations/blas_constants.h" #include "operations/blas_operators.h" -#include #include +#include #include namespace blas { @@ -44,10 +44,10 @@ struct Join { Join(lhs_t &_l, rhs_t _r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -61,10 +61,10 @@ struct Assign { rhs_t rhs_; Assign(lhs_t &_l, rhs_t _r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -81,10 +81,10 @@ struct DoubleAssign { rhs_2_t rhs_2_; DoubleAssign(lhs_1_t &_l1, lhs_2_t &_l2, rhs_1_t _r1, rhs_2_t _r2); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -100,10 +100,10 @@ struct ScalarOp { rhs_t rhs_; ScalarOp(scalar_t _scl, rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -117,10 +117,10 @@ struct UnaryOp { rhs_t rhs_; UnaryOp(rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -135,10 +135,10 @@ struct BinaryOp { rhs_t rhs_; BinaryOp(lhs_t &_l, rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -153,10 +153,10 @@ struct BinaryOpConst { rhs_t rhs_; BinaryOpConst(lhs_t &_l, rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i) const; - value_t eval(cl::sycl::nd_item<1> ndItem) const; - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem) const; + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -170,10 +170,10 @@ struct TupleOp { rhs_t rhs_; TupleOp(rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -191,12 +191,12 @@ struct AssignReduction { index_t global_num_thread_; // grid size AssignReduction(lhs_t &_l, rhs_t &_r, index_t _blqS, index_t _grdS); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT scratch, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT scratch, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -217,11 +217,11 @@ struct WGAtomicReduction { rhs_t rhs_; WGAtomicReduction(lhs_t &_l, rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - value_t eval(cl::sycl::nd_item<1> ndItem); + bool valid_thread(sycl::nd_item<1> ndItem) const; + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT scratch, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT scratch, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -247,11 +247,11 @@ struct IndexMaxMin { rhs_t rhs_; IndexMaxMin(lhs_t &_l, rhs_t &_r); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - void eval(cl::sycl::nd_item<1> ndItem); + bool valid_thread(sycl::nd_item<1> ndItem) const; + void eval(sycl::nd_item<1> ndItem); template - void eval(sharedT scratch, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + void eval(sharedT scratch, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -269,9 +269,9 @@ struct Rotg { Rotg(operand_t &a, operand_t &b, operand_t &c, operand_t &s); index_t get_size() const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + bool valid_thread(sycl::nd_item<1> ndItem) const; + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -291,9 +291,9 @@ struct Rotmg { operand_t ¶m); index_t get_size() const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem); + bool valid_thread(sycl::nd_item<1> ndItem) const; + void bind(sycl::handler &h); void adjust_access_displacement(); }; diff --git a/include/operations/blas2_trees.h b/include/operations/blas2_trees.h index 9dbbedebb..f69d6071a 100644 --- a/include/operations/blas2_trees.h +++ b/include/operations/blas2_trees.h @@ -77,11 +77,11 @@ struct Gemv { Gemv(lhs_t &_l, matrix_t &_matrix, vector_t &_vector, index_t &_wgs_per_nc, index_t &_wgs_per_c); - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - value_t eval(cl::sycl::nd_item<1> ndItem); + bool valid_thread(sycl::nd_item<1> ndItem) const; + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(local_memory_t local_mem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(local_memory_t local_mem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); private: @@ -119,12 +119,12 @@ struct SumMatrixColumns { index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem) const; - void bind(cl::sycl::handler &h); + value_t eval(sycl::nd_item<1> ndItem) const; + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -157,12 +157,12 @@ struct GemvCol { GemvCol(lhs_t &_l, matrix_t &_matrix, vector_t &_vector, index_t &_nWG_row, index_t &_nWG_col, index_t &_shrMemSize); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -199,12 +199,12 @@ struct GemvRow { GemvRow(lhs_t &_l, matrix_t &_matrix, vector_t &_vector, index_t &_nWG_row, index_t &_nWG_col, index_t &_shrMemSize); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; /*! @@ -252,12 +252,12 @@ struct Gbmv { Gbmv(lhs_t &_l, matrix_t &_matrix, index_t &_kl, index_t &_ku, vector_t &_vector, value_t _alpha, value_t _beta); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; /*! @@ -293,12 +293,12 @@ struct Sbmv { Sbmv(lhs_t &_l, matrix_t &_matrix, index_t &_k, vector_t &_vector, value_t _alpha, value_t _beta); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; /*! @@ -355,10 +355,10 @@ struct Xpmv { Xpmv(lhs_t &_l, matrix_t &_matrix, vector_t &_vector, value_t _alpha, value_t _beta); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; /*! @@ -394,12 +394,12 @@ struct Tbmv { Tbmv(lhs_t &_l, matrix_t &_matrix, index_t &_k, vector_t &_vector); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; /*! @@ -453,8 +453,8 @@ struct Txsv { Txsv(vector_t &_l, matrix_t &_matrix, index_t &_k, sync_t &_sync); template - value_t eval(local_memory_t local_mem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(local_memory_t local_mem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); value_t read_matrix(const index_t &row, const index_t &col) const; }; @@ -537,12 +537,12 @@ struct Ger { index_t &_nColsWG, index_t &_nWG_row, index_t &_nWG_col); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -578,12 +578,12 @@ struct GerRow { GerRow(lhs_t &_l, value_t _scl, rhs_1_t &_r1, rhs_2_t &_r2, index_t &_nWG_row, index_t &_nWG_col, index_t &_shrMemSize); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -617,12 +617,12 @@ struct GerCol { GerCol(lhs_t &_l, value_t _scl, rhs_1_t &_r1, rhs_2_t &_r2, index_t &_nWG_row, index_t &_nWG_col, index_t &_shrMemSize); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shrMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shrMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; @@ -669,9 +669,9 @@ struct Spr { Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, rhs_2_t &_r2); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + bool valid_thread(sycl::nd_item<1> ndItem) const; + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); index_t int_sqrt(int64_t s); void compute_row_col(const int64_t id, const index_t size, index_t &row, diff --git a/include/operations/blas3_trees.h b/include/operations/blas3_trees.h index d8ca1dc9f..d1c390dbc 100644 --- a/include/operations/blas3_trees.h +++ b/include/operations/blas3_trees.h @@ -25,7 +25,7 @@ #ifndef PORTBLAS_BLAS3_TREES_H #define PORTBLAS_BLAS3_TREES_H -#include +#include #include #include @@ -235,11 +235,11 @@ class Gemm { static std::string get_type_string() noexcept; index_t get_workgroup_cluster() const noexcept; index_t get_num_workgroup_cluster(index_t compute_units) const noexcept; - cl::sycl::nd_range<1> get_nd_range(index_t compute_units) const noexcept; + sycl::nd_range<1> get_nd_range(index_t compute_units) const noexcept; index_t get_size() const; - bool valid_thread(const cl::sycl::nd_item<1>& ndItem) const; - void eval(cl::sycl::nd_item<1> id) noexcept; - void bind(cl::sycl::handler& h); + bool valid_thread(const sycl::nd_item<1>& ndItem) const; + void eval(sycl::nd_item<1> id) noexcept; + void bind(sycl::handler& h); void adjust_access_displacement(); }; @@ -319,12 +319,12 @@ struct DiagonalBlocksInverter { index_t N_; DiagonalBlocksInverter(rhs_t A, lhs_t invA); - bool valid_thread(cl::sycl::nd_item<1> id) const; - void bind(cl::sycl::handler& cgh); + bool valid_thread(sycl::nd_item<1> id) const; + void bind(sycl::handler& cgh); void adjust_access_displacement(); template - void eval(local_memory_t localMem, cl::sycl::nd_item<1> id) noexcept; + void eval(local_memory_t localMem, sycl::nd_item<1> id) noexcept; }; template +#include #include #include @@ -199,43 +199,43 @@ struct constant, Indicator> { #endif template <> -struct constant +struct constant : constant {}; template <> -struct constant - : constant {}; +struct constant : constant { +}; template <> -struct constant +struct constant : constant {}; template <> -struct constant - : constant {}; +struct constant : constant { +}; template <> -struct constant +struct constant : constant {}; template <> -struct constant - : constant {}; +struct constant : constant { +}; template <> -struct constant - : constant {}; +struct constant : constant { +}; template <> -struct constant +struct constant : constant {}; template <> -struct constant +struct constant : constant {}; template <> -struct constant +struct constant : constant {}; template diff --git a/include/operations/extension/axpy_batch.h b/include/operations/extension/axpy_batch.h index a034201ab..948c87bc1 100644 --- a/include/operations/extension/axpy_batch.h +++ b/include/operations/extension/axpy_batch.h @@ -67,9 +67,9 @@ struct Axpy_batch { index_t _inc_l, index_t _lhs_stride, index_t _inc_r, index_t _rhs_stride, index_t _batch_size); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; - value_t eval(cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + bool valid_thread(sycl::nd_item<1> ndItem) const; + value_t eval(sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); }; diff --git a/include/operations/extension/matcopy_batch.h b/include/operations/extension/matcopy_batch.h index 367df1ebb..7c8e57d44 100644 --- a/include/operations/extension/matcopy_batch.h +++ b/include/operations/extension/matcopy_batch.h @@ -46,12 +46,12 @@ struct Matcopy_batch { index_t rhs_ld, index_t rhs_2_ld, index_t lhs_stride, index_t rhs_stride, index_t rhs_2_stride, index_t batch_size); index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + bool valid_thread(sycl::nd_item<1> ndItem) const; value_t eval(index_t i); - value_t eval(cl::sycl::nd_item<1> ndItem); + value_t eval(sycl::nd_item<1> ndItem); template - value_t eval(sharedT shMem, cl::sycl::nd_item<1> ndItem); - void bind(cl::sycl::handler &h); + value_t eval(sharedT shMem, sycl::nd_item<1> ndItem); + void bind(sycl::handler &h); void adjust_access_displacement(); void compute_matcopy_batch(const index_t wg_batch_id, const index_t wg_row, const index_t wg_col, const index_t item_id); diff --git a/include/operations/extension/reduction.h b/include/operations/extension/reduction.h index cc9563964..f3a707337 100644 --- a/include/operations/extension/reduction.h +++ b/include/operations/extension/reduction.h @@ -25,7 +25,7 @@ #ifndef PORTBLAS_EXTENSION_REDUCTION_H #define PORTBLAS_EXTENSION_REDUCTION_H -#include +#include #include "container/sycl_iterator.h" @@ -171,14 +171,14 @@ class Reduction { const index_t num_elems_to_preserve_; const index_t num_elems_to_reduce_; Reduction(input_t in, output_t out); - bool valid_thread(cl::sycl::nd_item<1> id) const; - void bind(cl::sycl::handler& h); + bool valid_thread(sycl::nd_item<1> id) const; + void bind(sycl::handler& h); void adjust_access_displacement(); - cl::sycl::nd_range<1> get_nd_range(index_t compute_units) noexcept; + sycl::nd_range<1> get_nd_range(index_t compute_units) noexcept; void reduce(index_t global_reduce_id, index_t global_preserve_id, element_t& accumulator) noexcept; template - void eval(local_memory_t scratch, cl::sycl::nd_item<1> id) noexcept; + void eval(local_memory_t scratch, sycl::nd_item<1> id) noexcept; }; /*! diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h index b684f0b01..b99a7e8b8 100644 --- a/include/operations/extension/transpose.h +++ b/include/operations/extension/transpose.h @@ -25,7 +25,7 @@ #ifndef PORTBLAS_EXTENSION_TRANSPOSE_H #define PORTBLAS_EXTENSION_TRANSPOSE_H -#include +#include #include "container/sycl_iterator.h" @@ -114,17 +114,17 @@ class Transpose { index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> item) const; - void bind(cl::sycl::handler &cgh); + bool valid_thread(sycl::nd_item<1> item) const; + void bind(sycl::handler &cgh); void adjust_access_displacement(); - void eval(cl::sycl::nd_item<1> item); + void eval(sycl::nd_item<1> item); template - void eval(local_memory_t local_mem, cl::sycl::nd_item<1> id); - void get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, - index_t &in_local_idx, index_t &out_idx, - index_t &out_local_idx, index_t &i_block_start, - index_t &j_block_start, index_t &il, index_t &jl); - void get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx, + void eval(local_memory_t local_mem, sycl::nd_item<1> id); + void get_indices(sycl::nd_item<1> id, index_t &in_idx, index_t &in_local_idx, + index_t &out_idx, index_t &out_local_idx, + index_t &i_block_start, index_t &j_block_start, index_t &il, + index_t &jl); + void get_indices(sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx, index_t &i, index_t &j); }; @@ -237,18 +237,18 @@ class TransposeAdd { index_t get_size() const; - bool valid_thread(cl::sycl::nd_item<1> item) const; - void bind(cl::sycl::handler &cgh); + bool valid_thread(sycl::nd_item<1> item) const; + void bind(sycl::handler &cgh); void adjust_access_displacement(); - void eval(cl::sycl::nd_item<1> item); + void eval(sycl::nd_item<1> item); template - void eval(local_memory_t local_mem, cl::sycl::nd_item<1> id); - void get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, - index_t &in_b_idx, index_t &in_local_idx, index_t &out_idx, + void eval(local_memory_t local_mem, sycl::nd_item<1> id); + void get_indices(sycl::nd_item<1> id, index_t &in_a_idx, index_t &in_b_idx, + index_t &in_local_idx, index_t &out_idx, index_t &out_local_idx, index_t &i_block_start, index_t &j_block_start, index_t &il, index_t &jl); - void get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, - index_t &in_b_idx, index_t &out_idx, index_t &i, index_t &j); + void get_indices(sycl::nd_item<1> id, index_t &in_a_idx, index_t &in_b_idx, + index_t &out_idx, index_t &i, index_t &j); }; /*! diff --git a/include/portblas.h b/include/portblas.h index 93719cfd6..e6c607332 100644 --- a/include/portblas.h +++ b/include/portblas.h @@ -23,7 +23,7 @@ * **************************************************************************/ -#include +#include #include "blas_meta.h" diff --git a/include/portblas_helper.h b/include/portblas_helper.h index 9171fcb7c..1d322fe19 100644 --- a/include/portblas_helper.h +++ b/include/portblas_helper.h @@ -27,7 +27,7 @@ #include "blas_meta.h" #include "container/sycl_iterator.h" -#include +#include namespace blas { namespace helper { @@ -54,31 +54,31 @@ struct AllocHelper { template typename std::enable_if::type>::type -allocate(int size, cl::sycl::queue q) { - return cl::sycl::malloc_device(size, q); +allocate(int size, sycl::queue q) { + return sycl::malloc_device(size, q); } #endif template typename std::enable_if::type>::type -allocate(int size, cl::sycl::queue q) { +allocate(int size, sycl::queue q) { return make_sycl_iterator_buffer(size); } #ifdef SB_ENABLE_USM template typename std::enable_if::type deallocate( - container_t mem, cl::sycl::queue q) { + container_t mem, sycl::queue q) { if (mem != NULL) { - cl::sycl::free(reinterpret_cast(mem), q); + sycl::free(reinterpret_cast(mem), q); } } #endif template typename std::enable_if::type deallocate( - container_t mem, cl::sycl::queue q) {} + container_t mem, sycl::queue q) {} template ::value @@ -94,12 +94,12 @@ template typename std::enable_if::type, AllocType::usm>::type>::value>::type -enqueue_deallocate(std::vector dependencies, container_t mem, - cl::sycl::queue q) { +enqueue_deallocate(std::vector dependencies, container_t mem, + sycl::queue q) { #ifdef SB_ENABLE_USM - auto event = q.submit([&](cl::sycl::handler &cgh) { + auto event = q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependencies); - cgh.host_task([=]() { cl::sycl::free(mem, q); }); + cgh.host_task([=]() { sycl::free(mem, q); }); }); #endif return; @@ -109,26 +109,25 @@ template typename std::enable_if::type, AllocType::buffer>::type>::value>::type -enqueue_deallocate(std::vector, container_t mem, - cl::sycl::queue q) {} +enqueue_deallocate(std::vector, container_t mem, sycl::queue q) {} -inline bool has_local_memory(cl::sycl::queue &q) { - return (q.get_device() - .template get_info() == - cl::sycl::info::local_mem_type::local); +inline bool has_local_memory(sycl::queue &q) { + return ( + q.get_device().template get_info() == + sycl::info::local_mem_type::local); } // Force the system not to set this to bigger than 256. Using work group size // bigger than 256 may cause out of resource error on different platforms. -inline size_t get_work_group_size(cl::sycl::queue &q) { +inline size_t get_work_group_size(sycl::queue &q) { return std::min( size_t(256), q.get_device() - .template get_info()); + .template get_info()); } -inline size_t get_num_compute_units(cl::sycl::queue &q) { +inline size_t get_num_compute_units(sycl::queue &q) { return q.get_device() - .template get_info(); + .template get_info(); } /* @brief Copying the data back to device @@ -138,12 +137,12 @@ inline size_t get_num_compute_units(cl::sycl::queue &q) { @param size is the number of elements to be copied */ template -inline cl::sycl::event copy_to_device( - cl::sycl::queue q, const element_t *src, BufferIterator dst, - size_t size, const std::vector &_dependencies = {}) { - auto event = q.submit([&](cl::sycl::handler &cgh) { - auto acc = dst.template get_range_accessor( - cgh, size); +inline sycl::event copy_to_device( + sycl::queue q, const element_t *src, BufferIterator dst, + size_t size, const std::vector &_dependencies = {}) { + auto event = q.submit([&](sycl::handler &cgh) { + auto acc = + dst.template get_range_accessor(cgh, size); cgh.depends_on(_dependencies); cgh.copy(src, acc); }); @@ -152,9 +151,9 @@ inline cl::sycl::event copy_to_device( #ifdef SB_ENABLE_USM template -inline cl::sycl::event copy_to_device( - cl::sycl::queue q, const element_t *src, element_t *dst, size_t size, - const std::vector &_dependencies = {}) { +inline sycl::event copy_to_device( + sycl::queue q, const element_t *src, element_t *dst, size_t size, + const std::vector &_dependencies = {}) { auto event = q.memcpy(dst, src, size * sizeof(element_t), _dependencies); return event; } @@ -167,12 +166,11 @@ inline cl::sycl::event copy_to_device( @param size is the number of elements to be copied */ template -inline cl::sycl::event copy_to_host(cl::sycl::queue q, - BufferIterator src, - element_t *dst, size_t size) { - auto event = q.submit([&](cl::sycl::handler &cgh) { - auto acc = src.template get_range_accessor( - cgh, size); +inline sycl::event copy_to_host(sycl::queue q, BufferIterator src, + element_t *dst, size_t size) { + auto event = q.submit([&](sycl::handler &cgh) { + auto acc = + src.template get_range_accessor(cgh, size); cgh.copy(acc, dst); }); return event; @@ -180,14 +178,14 @@ inline cl::sycl::event copy_to_host(cl::sycl::queue q, #ifdef SB_ENABLE_USM template -inline cl::sycl::event copy_to_host(cl::sycl::queue q, element_t *src, - element_t *dst, size_t size) { +inline sycl::event copy_to_host(sycl::queue q, element_t *src, element_t *dst, + size_t size) { auto event = q.memcpy(dst, src, size * sizeof(element_t)); return event; } template -inline cl::sycl::event copy_to_host(cl::sycl::queue q, const element_t *src, - element_t *dst, size_t size) { +inline sycl::event copy_to_host(sycl::queue q, const element_t *src, + element_t *dst, size_t size) { auto event = q.memcpy(dst, src, size * sizeof(element_t)); return event; } @@ -195,13 +193,13 @@ inline cl::sycl::event copy_to_host(cl::sycl::queue q, const element_t *src, #endif template -inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator buff, - element_t value, size_t size, - const std::vector &_dependencies) { - auto event = q.submit([&](cl::sycl::handler &cgh) { +inline sycl::event fill(sycl::queue q, BufferIterator buff, + element_t value, size_t size, + const std::vector &_dependencies) { + auto event = q.submit([&](sycl::handler &cgh) { cgh.depends_on(_dependencies); - auto acc = buff.template get_range_accessor( - cgh, size); + auto acc = + buff.template get_range_accessor(cgh, size); cgh.fill(acc, value); }); return event; @@ -209,10 +207,10 @@ inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator buff, #ifdef SB_ENABLE_USM template -inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value, - size_t size, - const std::vector &dependencies) { - auto event = q.submit([&](cl::sycl::handler &cgh) { +inline sycl::event fill(sycl::queue q, element_t *buff, element_t value, + size_t size, + const std::vector &dependencies) { + auto event = q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependencies); cgh.fill(buff, value, size); }); @@ -223,8 +221,8 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value, template inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) { if constexpr (std::is_pointer_v) { - return cl::sycl::usm::alloc::shared == - cl::sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + return sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } else { return false; } diff --git a/include/sb_handle/kernel_constructor.h b/include/sb_handle/kernel_constructor.h index b3d5bdccb..c201332b1 100644 --- a/include/sb_handle/kernel_constructor.h +++ b/include/sb_handle/kernel_constructor.h @@ -26,7 +26,7 @@ #ifndef PORTBLAS_KERNEL_CONSTRUCTOR_H #define PORTBLAS_KERNEL_CONSTRUCTOR_H -#include +#include namespace blas { @@ -118,9 +118,10 @@ struct ExpressionTreeFunctor; using_local_memory == false). */ template -static cl::sycl::event execute_tree( - queue_t q, expression_tree_t t, size_t _localSize, size_t _globalSize, - size_t _shMem, std::vector dependencies = {}); +static sycl::event execute_tree(queue_t q, expression_tree_t t, + size_t _localSize, size_t _globalSize, + size_t _shMem, + std::vector dependencies = {}); } // namespace blas diff --git a/include/sb_handle/portblas_handle.h b/include/sb_handle/portblas_handle.h index 836b37c61..041c17eec 100644 --- a/include/sb_handle/portblas_handle.h +++ b/include/sb_handle/portblas_handle.h @@ -44,10 +44,10 @@ namespace blas { * Only one method is mandatory, the Execute method. */ class SB_Handle { - using queue_t = cl::sycl::queue; + using queue_t = sycl::queue; public: - using event_t = std::vector; + using event_t = std::vector; inline SB_Handle(queue_t q) : #ifndef __ADAPTIVECPP__ @@ -174,22 +174,19 @@ class SB_Handle { inline void wait() { q_.wait(); } - inline void wait(std::vector evs) { - cl::sycl::event::wait(evs); - } + inline void wait(std::vector evs) { sycl::event::wait(evs); } - inline void wait(cl::sycl::event ev) { cl::sycl::event::wait({ev}); } + inline void wait(sycl::event ev) { sycl::event::wait({ev}); } /* @brief waiting for a list of sycl events - @param first_event and next_events are instances of sycl::sycl::event + @param first_event and next_events are instances of sycl::event */ // this must be in header as the number of event is controlled by user and we // dont know howmany permutation can be used by a user template void inline wait(first_event_t first_event, next_event_t... next_dependencies) { - cl::sycl::event::wait( - concatenate_vectors(first_event, next_dependencies...)); + sycl::event::wait(concatenate_vectors(first_event, next_dependencies...)); } private: diff --git a/include/sb_handle/temp_memory_pool.h b/include/sb_handle/temp_memory_pool.h index 836fb98c3..62aaf92c9 100644 --- a/include/sb_handle/temp_memory_pool.h +++ b/include/sb_handle/temp_memory_pool.h @@ -31,11 +31,11 @@ namespace blas { class Temp_Mem_Pool { - using queue_t = cl::sycl::queue; - using event_t = std::vector; + using queue_t = sycl::queue; + using event_t = std::vector; using temp_usm_map_t = std::multimap; using temp_usm_size_map_t = std::map; - using temp_buffer_map_t = std::multimap>; + using temp_buffer_map_t = std::multimap>; public: Temp_Mem_Pool(queue_t q) @@ -65,7 +65,7 @@ class Temp_Mem_Pool { << " bytes)" << std::endl; #endif for (const temp_usm_map_t::value_type& p : temp_usm_map_) - cl::sycl::free(p.second, q_); + sycl::free(p.second, q_); #endif } diff --git a/include/views/view.h b/include/views/view.h index 848db42aa..1a3c0df2c 100644 --- a/include/views/view.h +++ b/include/views/view.h @@ -94,7 +94,7 @@ struct VectorView { */ PORTBLAS_INLINE increment_t get_stride(); - PORTBLAS_INLINE void bind(cl::sycl::handler &h) const {} + PORTBLAS_INLINE void bind(sycl::handler &h) const {} /**** EVALUATING ****/ template @@ -109,11 +109,11 @@ struct VectorView { return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_); } - PORTBLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) { + PORTBLAS_INLINE value_t &eval(sycl::nd_item<1> ndItem) { return eval(ndItem.get_global_id(0)); } - PORTBLAS_INLINE value_t eval(cl::sycl::nd_item<1> ndItem) const { + PORTBLAS_INLINE value_t eval(sycl::nd_item<1> ndItem) const { return eval(ndItem.get_global_id(0)); } @@ -204,7 +204,7 @@ struct MatrixView { */ PORTBLAS_INLINE void adjust_access_displacement() const; - PORTBLAS_INLINE void bind(cl::sycl::handler &h) const {} + PORTBLAS_INLINE void bind(sycl::handler &h) const {} /*! eval. * @brief Evaluation for the pair of row/col. @@ -257,11 +257,11 @@ struct MatrixView { return eval(i, j); } - PORTBLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) { + PORTBLAS_INLINE value_t &eval(sycl::nd_item<1> ndItem) { return eval(ndItem.get_global_id(0)); } - PORTBLAS_INLINE value_t eval(cl::sycl::nd_item<1> ndItem) const noexcept { + PORTBLAS_INLINE value_t eval(sycl::nd_item<1> ndItem) const noexcept { return eval(ndItem.get_global_id(0)); } @@ -283,10 +283,9 @@ struct VectorViewType; template struct VectorViewType, index_t, increment_t> { - static constexpr cl::sycl::access::mode access_mode_t = - Choose::value, cl::sycl::access::mode, - cl::sycl::access::mode::read, - cl::sycl::access::mode::read_write>::type; + static constexpr sycl::access_mode access_mode_t = + Choose::value, sycl::access_mode, + sycl::access_mode::read, sycl::access_mode::read_write>::type; using type = VectorView::template default_accessor_t< access_mode_t>, @@ -308,10 +307,9 @@ struct MatrixViewType; template struct MatrixViewType, index_t, access_layout_t, has_inc> { - static constexpr cl::sycl::access::mode access_mode_t = - Choose::value, cl::sycl::access::mode, - cl::sycl::access::mode::read, - cl::sycl::access::mode::read_write>::type; + static constexpr sycl::access_mode access_mode_t = + Choose::value, sycl::access_mode, + sycl::access_mode::read, sycl::access_mode::read_write>::type; using type = MatrixView::template default_accessor_t< access_mode_t>, @@ -331,10 +329,9 @@ struct MatrixViewType { template static PORTBLAS_INLINE auto make_vector_view(BufferIterator buff, increment_t inc, index_t sz) { - static constexpr cl::sycl::access::mode access_mode_t = - Choose::value, cl::sycl::access::mode, - cl::sycl::access::mode::read, - cl::sycl::access::mode::read_write>::type; + static constexpr sycl::access_mode access_mode_t = + Choose::value, sycl::access_mode, + sycl::access_mode::read, sycl::access_mode::read_write>::type; using leaf_node_t = VectorView::template default_accessor_t< access_mode_t>, @@ -348,10 +345,9 @@ template buff, index_t m, index_t n, index_t lda, index_t inc = 1) { - static constexpr cl::sycl::access::mode access_mode_t = - Choose::value, cl::sycl::access::mode, - cl::sycl::access::mode::read, - cl::sycl::access::mode::read_write>::type; + static constexpr sycl::access_mode access_mode_t = + Choose::value, sycl::access_mode, + sycl::access_mode::read, sycl::access_mode::read_write>::type; using leaf_node_t = MatrixView::template default_accessor_t< access_mode_t>, diff --git a/samples/gemm.cpp b/samples/gemm.cpp index c39b9462b..2f77e91db 100644 --- a/samples/gemm.cpp +++ b/samples/gemm.cpp @@ -1,11 +1,11 @@ #include "portblas.hpp" -#include +#include #include "util.hpp" int main(int argc, char** argv) { /* Create a SYCL queue with the default device selector */ - cl::sycl::queue q = cl::sycl::queue(cl::sycl::default_selector()); + sycl::queue q = sycl::queue(sycl::default_selector_v); /* Create a portBLAS sb_handle and get the policy handler */ blas::SB_Handle sb_handle(q); diff --git a/samples/gemv.cpp b/samples/gemv.cpp index 634176aa5..211843764 100644 --- a/samples/gemv.cpp +++ b/samples/gemv.cpp @@ -1,11 +1,11 @@ #include "portblas.hpp" -#include +#include #include "util.hpp" int main(int argc, char** argv) { /* Create a SYCL queue with the default device selector */ - cl::sycl::queue q = cl::sycl::queue(cl::sycl::default_selector()); + sycl::queue q = sycl::queue(sycl::default_selector_v); /* Create a portBLAS sb_handle and get the policy handler */ blas::SB_Handle sb_handle(q); diff --git a/samples/symm.cpp b/samples/symm.cpp index 1bb4a9a53..5e89f21a0 100644 --- a/samples/symm.cpp +++ b/samples/symm.cpp @@ -1,13 +1,13 @@ #include "portblas.hpp" -#include +#include #include "util.hpp" int main(int argc, char** argv) { /* Create a SYCL queue with the default device selector */ - cl::sycl::queue q = cl::sycl::queue(cl::sycl::default_selector()); + sycl::queue q = sycl::queue(sycl::default_selector_v); /* Create a portBLAS sb_handle and get the policy handler */ blas::SB_Handle sb_handle(q); diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index e92027823..bb78b0d56 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -189,26 +189,10 @@ template ::type vx = - make_vector_view(_vx, _incx, _N); - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); - - const auto localSize = sb_handle.get_work_group_size(); - const auto nWG = 2 * localSize; - auto assignOp = make_assign_reduction(rs, vx, localSize, - localSize * nWG); - auto ret = sb_handle.execute(assignOp, _dependencies); - return ret; -#else return blas::asum::backend::_asum(sb_handle, _N, _vx, _incx, _rs, _dependencies); -#endif } -#if SYCL_LANGUAGE_VERSION >= 202000 && !defined(__ADAPTIVECPP__) /*! _asum_impl. * @brief Internal implementation of the Absolute sum operator. * @@ -260,7 +244,6 @@ typename sb_handle_t::event_t _asum_impl( } return ret; } -#endif /** * _iamax_iamin_impl. @@ -312,7 +295,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); ret = sb_handle.execute(op, min_sg_size, min_sg_size, _dependencies); } else { ret = sb_handle.execute( @@ -327,7 +310,7 @@ typename sb_handle_t::event_t _iamax_iamin_impl( // get the minimum supported sub_group size const index_t min_sg_size = static_cast( q.get_device() - .template get_info()[0]); + .template get_info()[0]); // if using no local memory, every sub_group writes one intermediate output, // in case if sub_group size is not known at allocation time, than allocate // extra memory using min supported sub_group size. diff --git a/src/interface/blas2/backend/default.hpp b/src/interface/blas2/backend/default.hpp index d0208bc97..77d159dd3 100644 --- a/src/interface/blas2/backend/default.hpp +++ b/src/interface/blas2/backend/default.hpp @@ -140,7 +140,7 @@ typename sb_handle_t::event_t _trsv( const auto device = sb_handle.get_queue().get_device(); if (device.is_gpu()) { const std::string vendor = - device.template get_info(); + device.template get_info(); if (vendor.find("Intel") == vendor.npos) { return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>( sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); @@ -168,7 +168,7 @@ typename sb_handle_t::event_t _tbsv( const auto device = sb_handle.get_queue().get_device(); if (device.is_gpu()) { const std::string vendor = - device.template get_info(); + device.template get_info(); if (vendor.find("Intel") == vendor.npos) { return blas::internal::_tbsv_impl<32, 4, uplo, trn, diag>( sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); @@ -195,7 +195,7 @@ typename sb_handle_t::event_t _tpsv( const auto device = sb_handle.get_queue().get_device(); if (device.is_gpu()) { const std::string vendor = - device.template get_info(); + device.template get_info(); if (vendor.find("Intel") == vendor.npos) { return blas::internal::_tpsv_impl<32, 4, uplo, trn, diag>( sb_handle, _N, _mA, _vx, _incx, _dependencies); diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 6f43d5300..16cc32bc4 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -356,7 +356,7 @@ typename sb_handle_t::event_t _trsv_impl( sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) +#ifdef __ADAPTIVECPP__ throw std::runtime_error("trsv requires SYCL 2020"); #else static_assert(subgroup_size % subgroups == 0, @@ -747,7 +747,7 @@ typename sb_handle_t::event_t _tbsv_impl( sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { -#if (SYCL_LANGUAGE_VERSION < 202000) || (defined __ADAPTIVECPP__) +#ifdef __ADAPTIVECPP__ throw std::runtime_error("tbsv requires SYCL 2020"); #else static_assert(subgroup_size % subgroups == 0, @@ -810,7 +810,7 @@ template subgroup_sizes = sb_handle.get_queue() .get_device() - .template get_info(); + .template get_info(); size_t min_subgroup_size = *subgroup_sizes.begin(); size_t max_subgroup_size = *subgroup_sizes.rbegin(); assert(((_nRowsWG * _nColsWG) / _localSize) <= min_subgroup_size); diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp index e5d599e18..366f34ba6 100644 --- a/src/interface/blas3/backend/nvidia_gpu.hpp +++ b/src/interface/blas3/backend/nvidia_gpu.hpp @@ -70,7 +70,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, return blas::Gemm_Launcher< container_0_t, container_1_t, container_2_t, 256, false, true, true, 128, - Tile<8, 8, 16, 16, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, + Tile<8, 8, 16, 16, 16, 2, 1, 1, 1, 1, 16, 16, 16, sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), @@ -84,8 +84,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, return blas::Gemm_Launcher< container_0_t, container_1_t, container_2_t, 128, false, true, true, 128, - Tile<4, 8, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, - float>, + Tile<4, 8, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::none), is_beta_zero, 1, @@ -99,8 +98,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, return blas::Gemm_Launcher< container_0_t, container_1_t, container_2_t, 128, false, true, true, 128, - Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, - float>, + Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::none), is_beta_zero, 1, diff --git a/src/operations/blas1/IndexMaxMin.hpp b/src/operations/blas1/IndexMaxMin.hpp index c2e7b0746..10104c443 100644 --- a/src/operations/blas1/IndexMaxMin.hpp +++ b/src/operations/blas1/IndexMaxMin.hpp @@ -64,7 +64,7 @@ IndexMaxMin::get_size() const { template PORTBLAS_INLINE bool IndexMaxMin::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } @@ -73,7 +73,7 @@ PORTBLAS_INLINE bool IndexMaxMin::valid_thread( */ template PORTBLAS_INLINE void IndexMaxMin::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { using op = typename SelectOperator::op; const auto size = rhs_.get_size(); auto sg = ndItem.get_sub_group(); @@ -96,8 +96,8 @@ PORTBLAS_INLINE void IndexMaxMin::eval( // reduction within the sub_group for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) { - element_t shfl_val = cl::sycl::shift_group_left(sg, val.get_value(), i); - index_t shfl_idx = cl::sycl::shift_group_left(sg, val.get_index(), i); + element_t shfl_val = sycl::shift_group_left(sg, val.get_value(), i); + index_t shfl_idx = sycl::shift_group_left(sg, val.get_index(), i); value_t shfl{shfl_idx, shfl_val}; val = op::eval(val, shfl); } @@ -126,7 +126,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( template template PORTBLAS_INLINE void IndexMaxMin::eval( - sharedT scratch, cl::sycl::nd_item<1> ndItem) { + sharedT scratch, sycl::nd_item<1> ndItem) { using op = typename SelectOperator::op; const auto size = rhs_.get_size(); const auto local_range = ndItem.get_local_range(0); @@ -142,7 +142,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( } scratch[local_id] = val; - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); value_t local_val = op::template init(); // reduction within the work group @@ -152,7 +152,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( local_val = scratch[local_id + i]; scratch[local_id] = op::eval(val, local_val); } - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } // write IndexValueTuple to Global Memory iff reduction step0 @@ -172,7 +172,7 @@ PORTBLAS_INLINE void IndexMaxMin::eval( template PORTBLAS_INLINE void IndexMaxMin::bind( - cl::sycl::handler& h) { + sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index ca46b8269..ce29d1cb5 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -54,7 +54,7 @@ template PORTBLAS_INLINE bool WGAtomicReduction::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } @@ -63,12 +63,11 @@ template ::value_t WGAtomicReduction::eval( - cl::sycl::nd_item<1> ndItem) { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); + sycl::nd_item<1> ndItem) { + auto atomic_res = sycl::atomic_ref( + lhs_.get_data()[0]); const auto size = get_size(); int lid = ndItem.get_global_linear_id(); value_t val = operator_t::template init(); @@ -80,8 +79,8 @@ PORTBLAS_INLINE val = operator_t::eval(val, rhs_.eval(id)); } - val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, - cl::sycl::plus()); + val = sycl::reduce_over_group(ndItem.get_sub_group(), val, + sycl::plus()); if ((ndItem.get_local_id()[0] & (ndItem.get_sub_group().get_local_range()[0] - 1)) == 0) { @@ -96,7 +95,7 @@ template PORTBLAS_INLINE typename WGAtomicReduction::value_t WGAtomicReduction::eval( - sharedT scratch, cl::sycl::nd_item<1> ndItem) { + sharedT scratch, sycl::nd_item<1> ndItem) { const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); const auto loop_stride = @@ -108,8 +107,8 @@ PORTBLAS_INLINE val = operator_t::eval(val, rhs_.eval(id)); } - val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, - cl::sycl::plus()); + val = sycl::reduce_over_group(ndItem.get_sub_group(), val, + sycl::plus()); if (ndItem.get_sub_group().get_local_id()[0] == 0) { scratch[ndItem.get_sub_group().get_group_linear_id()] = val; @@ -122,17 +121,16 @@ PORTBLAS_INLINE ? scratch[ndItem.get_sub_group().get_local_id()] : 0; if (ndItem.get_sub_group().get_group_id()[0] == 0) { - val = cl::sycl::reduce_over_group(ndItem.get_sub_group(), val, - cl::sycl::plus()); + val = sycl::reduce_over_group(ndItem.get_sub_group(), val, + sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - constexpr cl::sycl::access::address_space addr_sp = - usmManagedMem ? cl::sycl::access::address_space::generic_space - : cl::sycl::access::address_space::global_space; - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); + constexpr sycl::access::address_space addr_sp = + usmManagedMem ? sycl::access::address_space::generic_space + : sycl::access::address_space::global_space; + auto atomic_res = sycl::atomic_ref( + lhs_.get_data()[0]); atomic_res += val; } @@ -142,7 +140,7 @@ PORTBLAS_INLINE template PORTBLAS_INLINE void WGAtomicReduction::bind(cl::sycl::handler& h) { + rhs_t>::bind(sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } diff --git a/src/operations/blas1_trees.hpp b/src/operations/blas1_trees.hpp index 6a78c0508..f74d390b9 100644 --- a/src/operations/blas1_trees.hpp +++ b/src/operations/blas1_trees.hpp @@ -26,13 +26,11 @@ #ifndef PORTBLAS_BLAS1_TREES_HPP #define PORTBLAS_BLAS1_TREES_HPP +#include "blas1/IndexMaxMin.hpp" +#include "blas1/WGAtomicReduction.hpp" #include "operations/blas1_trees.h" #include "operations/blas_operators.hpp" #include "views/view.hpp" -#if SYCL_LANGUAGE_VERSION >= 202000 -#include "blas1/WGAtomicReduction.hpp" -#endif -#include "blas1/IndexMaxMin.hpp" #include "views/view_sycl.hpp" #include #include @@ -84,8 +82,8 @@ struct DetectScalar { * @brief See Detect Scalar. */ template <> -struct DetectScalar { - using element_t = cl::sycl::half; +struct DetectScalar { + using element_t = sycl::half; static element_t get_scalar(element_t &scalar) { return scalar; } }; @@ -124,7 +122,7 @@ Join::get_size() const { template PORTBLAS_INLINE bool Join::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < Join::get_size())); } @@ -137,11 +135,11 @@ PORTBLAS_INLINE typename Join::value_t Join::eval( template PORTBLAS_INLINE typename Join::value_t Join::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { return Join::eval(ndItem.get_global_id(0)); } template -PORTBLAS_INLINE void Join::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Join::bind(sycl::handler &h) { lhs_.bind(h); rhs_.bind(h); } @@ -165,7 +163,7 @@ Assign::get_size() const { template PORTBLAS_INLINE bool Assign::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { using index_t = typename Assign::index_t; return (static_cast(ndItem.get_global_id(0)) < Assign::get_size()); @@ -180,12 +178,12 @@ Assign::eval(typename Assign::index_t i) { template PORTBLAS_INLINE typename Assign::value_t -Assign::eval(cl::sycl::nd_item<1> ndItem) { +Assign::eval(sycl::nd_item<1> ndItem) { return Assign::eval(ndItem.get_global_id(0)); } template -PORTBLAS_INLINE void Assign::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Assign::bind(sycl::handler &h) { lhs_.bind(h); rhs_.bind(h); } @@ -215,7 +213,7 @@ template PORTBLAS_INLINE bool DoubleAssign::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < get_size())); } @@ -237,14 +235,14 @@ template ::value_t DoubleAssign::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { return DoubleAssign::eval( ndItem.get_global_id(0)); } template PORTBLAS_INLINE void DoubleAssign::bind( - cl::sycl::handler &h) { + sycl::handler &h) { lhs_1_.bind(h); rhs_1_.bind(h); lhs_2_.bind(h); @@ -276,7 +274,7 @@ ScalarOp::get_size() const { } template PORTBLAS_INLINE bool ScalarOp::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < ScalarOp::get_size())); } @@ -288,12 +286,12 @@ ScalarOp::eval( } template PORTBLAS_INLINE typename ScalarOp::value_t -ScalarOp::eval(cl::sycl::nd_item<1> ndItem) { +ScalarOp::eval(sycl::nd_item<1> ndItem) { return ScalarOp::eval(ndItem.get_global_id(0)); } template PORTBLAS_INLINE void ScalarOp::bind( - cl::sycl::handler &h) { + sycl::handler &h) { rhs_.bind(h); } @@ -316,7 +314,7 @@ UnaryOp::get_size() const { template PORTBLAS_INLINE bool UnaryOp::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < UnaryOp::get_size())); } @@ -329,11 +327,11 @@ UnaryOp::eval( template PORTBLAS_INLINE typename UnaryOp::value_t -UnaryOp::eval(cl::sycl::nd_item<1> ndItem) { +UnaryOp::eval(sycl::nd_item<1> ndItem) { return UnaryOp::eval(ndItem.get_global_id(0)); } template -PORTBLAS_INLINE void UnaryOp::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void UnaryOp::bind(sycl::handler &h) { rhs_.bind(h); } template @@ -355,7 +353,7 @@ BinaryOp::get_size() const { } template PORTBLAS_INLINE bool BinaryOp::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < get_size())); } @@ -367,12 +365,12 @@ BinaryOp::eval( } template PORTBLAS_INLINE typename BinaryOp::value_t -BinaryOp::eval(cl::sycl::nd_item<1> ndItem) { +BinaryOp::eval(sycl::nd_item<1> ndItem) { return BinaryOp::eval(ndItem.get_global_id(0)); } template PORTBLAS_INLINE void BinaryOp::bind( - cl::sycl::handler &h) { + sycl::handler &h) { lhs_.bind(h); rhs_.bind(h); } @@ -398,7 +396,7 @@ BinaryOpConst::get_size() const { } template PORTBLAS_INLINE bool BinaryOpConst::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < get_size())); } @@ -410,13 +408,12 @@ BinaryOpConst::eval( } template PORTBLAS_INLINE typename BinaryOpConst::value_t -BinaryOpConst::eval( - cl::sycl::nd_item<1> ndItem) const { +BinaryOpConst::eval(sycl::nd_item<1> ndItem) const { return BinaryOpConst::eval(ndItem.get_global_id(0)); } template PORTBLAS_INLINE void BinaryOpConst::bind( - cl::sycl::handler &h) { + sycl::handler &h) { lhs_.bind(h); rhs_.bind(h); } @@ -442,7 +439,7 @@ PORTBLAS_INLINE typename TupleOp::index_t TupleOp::get_size() template PORTBLAS_INLINE bool TupleOp::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < get_size())); } @@ -454,11 +451,11 @@ PORTBLAS_INLINE typename TupleOp::value_t TupleOp::eval( template PORTBLAS_INLINE typename TupleOp::value_t TupleOp::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { return TupleOp::eval(ndItem.get_global_id(0)); } template -PORTBLAS_INLINE void TupleOp::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void TupleOp::bind(sycl::handler &h) { rhs_.bind(h); } template @@ -484,7 +481,7 @@ AssignReduction::get_size() const { template PORTBLAS_INLINE bool AssignReduction::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } template @@ -519,7 +516,7 @@ AssignReduction::eval( } template PORTBLAS_INLINE typename AssignReduction::value_t -AssignReduction::eval(cl::sycl::nd_item<1> ndItem) { +AssignReduction::eval(sycl::nd_item<1> ndItem) { return AssignReduction::eval( ndItem.get_global_id(0)); } @@ -527,7 +524,7 @@ template template PORTBLAS_INLINE typename AssignReduction::value_t AssignReduction::eval(sharedT scratch, - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { index_t localid = ndItem.get_local_id(0); index_t localSz = ndItem.get_local_range(0); index_t groupid = ndItem.get_group(0); @@ -548,7 +545,7 @@ AssignReduction::eval(sharedT scratch, scratch[localid] = val; // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); // Reduction inside the block for (index_t offset = localSz >> 1; offset > 0; offset >>= 1) { @@ -557,7 +554,7 @@ AssignReduction::eval(sharedT scratch, operator_t::eval(scratch[localid], scratch[localid + offset]); } // This barrier is mandatory to be sure the data are on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } if (localid == 0) { lhs_.eval(groupid) = scratch[localid]; @@ -567,7 +564,7 @@ AssignReduction::eval(sharedT scratch, template PORTBLAS_INLINE void AssignReduction::bind( - cl::sycl::handler &h) { + sycl::handler &h) { lhs_.bind(h); rhs_.bind(h); } @@ -634,18 +631,18 @@ PORTBLAS_INLINE typename Rotg::value_t Rotg::eval( template PORTBLAS_INLINE typename Rotg::value_t Rotg::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { return Rotg::eval(ndItem.get_global_id(0)); } template PORTBLAS_INLINE bool Rotg::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < Rotg::get_size())); } template -PORTBLAS_INLINE void Rotg::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Rotg::bind(sycl::handler &h) { a_.bind(h); b_.bind(h); c_.bind(h); @@ -904,18 +901,18 @@ PORTBLAS_INLINE typename Rotmg::value_t Rotmg::eval( template PORTBLAS_INLINE typename Rotmg::value_t Rotmg::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { return Rotmg::eval(ndItem.get_global_id(0)); } template PORTBLAS_INLINE bool Rotmg::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < Rotmg::get_size())); } template -PORTBLAS_INLINE void Rotmg::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Rotmg::bind(sycl::handler &h) { d1_.bind(h); d2_.bind(h); x1_.bind(h); diff --git a/src/operations/blas2/gbmv.hpp b/src/operations/blas2/gbmv.hpp index df0383ff9..58c763799 100644 --- a/src/operations/blas2/gbmv.hpp +++ b/src/operations/blas2/gbmv.hpp @@ -69,7 +69,7 @@ template PORTBLAS_INLINE bool Gbmv::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { // Valid threads are established by ::eval. return true; } @@ -77,9 +77,9 @@ Gbmv::valid_thread( template PORTBLAS_INLINE typename Gbmv::value_t + is_transposed>::value_t Gbmv::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t lhs_idx = ndItem.get_global_id(0); value_t val = 0; @@ -87,9 +87,8 @@ Gbmv::eval( const index_t k_lower = is_transposed ? ku_ : kl_; const index_t k_upper = is_transposed ? kl_ : ku_; - const index_t k_beg = cl::sycl::max(index_t(0), lhs_idx - k_lower); - const index_t k_end = - cl::sycl::min(vector_.get_size(), lhs_idx + k_upper + 1); + const index_t k_beg = sycl::max(index_t(0), lhs_idx - k_lower); + const index_t k_end = sycl::min(vector_.get_size(), lhs_idx + k_upper + 1); const index_t k_off = ku_ + (is_transposed ? -lhs_idx : lhs_idx); for (index_t s_idx = k_beg; s_idx < k_end; ++s_idx) { @@ -110,7 +109,7 @@ Gbmv::eval( template PORTBLAS_INLINE void Gbmv::bind(cl::sycl::handler &h) { + is_transposed>::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); diff --git a/src/operations/blas2/gemv.hpp b/src/operations/blas2/gemv.hpp index d6da74e93..7b393185e 100644 --- a/src/operations/blas2/gemv.hpp +++ b/src/operations/blas2/gemv.hpp @@ -25,6 +25,7 @@ #ifndef GEMV_HPP #define GEMV_HPP + #include "operations/blas2_trees.h" #include "operations/blas_operators.hpp" #include "views/view_sycl.hpp" @@ -47,7 +48,7 @@ SumMatrixColumns::get_size() const { template PORTBLAS_INLINE bool SumMatrixColumns::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return ((ndItem.get_global_id(0) < get_size())); } @@ -68,12 +69,12 @@ SumMatrixColumns::eval(typename SumMatrixColumns::index_t i) { template PORTBLAS_INLINE typename SumMatrixColumns::value_t -SumMatrixColumns::eval(cl::sycl::nd_item<1> ndItem) const { +SumMatrixColumns::eval(sycl::nd_item<1> ndItem) const { return eval(ndItem.get_global_id(0)); } template -PORTBLAS_INLINE void SumMatrixColumns::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void SumMatrixColumns::bind(sycl::handler &h) { rhs_.bind(h); } @@ -111,7 +112,7 @@ template PORTBLAS_INLINE bool Gemv::valid_thread(cl::sycl::nd_item<1>) const { + work_per_thread>::valid_thread(sycl::nd_item<1>) const { return true; } @@ -129,7 +130,7 @@ PORTBLAS_INLINE typename Gemv::value_t Gemv::eval(cl::sycl::nd_item<1> ndItem) { + work_per_thread>::eval(sycl::nd_item<1> ndItem) { const index_t local_id = ndItem.get_local_id(0); const index_t group_id = ndItem.get_group(0); const index_t group_range = ndItem.get_group_range(0); @@ -157,8 +158,8 @@ PORTBLAS_INLINE sum = 0; for (index_t col_id = 0; col_id < contract_dim; ++col_id) { - sum = cl::sycl::mad(matrix_a_.template eval(non_contract_dim_index), - vector_x_.eval(col_id), sum); + sum = sycl::mad(matrix_a_.template eval(non_contract_dim_index), + vector_x_.eval(col_id), sum); non_contract_dim_index += contract_stride; } @@ -182,7 +183,7 @@ PORTBLAS_INLINE cache_line_size, work_per_thread>::value_t Gemv::eval(local_memory_t local_mem, - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t local_id = ndItem.get_local_id(0); const index_t group_id = ndItem.get_group(0); @@ -206,7 +207,7 @@ PORTBLAS_INLINE : value_t{0}; // Barrier to ensure whole portion of vector X is in local memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); // Non-contracting dimension index const index_t nc_dim_index = local_id + nc_group_id * local_range; @@ -225,13 +226,13 @@ PORTBLAS_INLINE // Calculate the matrix index index_t mat_index = nc_dim_index + c_group_id * local_range * lda; - const index_t last_c_dim_id = cl::sycl::min( + const index_t last_c_dim_id = sycl::min( index_t(c_dim - c_group_id * local_range), index_t(local_range)); // Computes the partial dot product for a row for (index_t c_dim_id = 0; c_dim_id < last_c_dim_id; ++c_dim_id) { - sum = cl::sycl::mad(matrix_a_.template eval(mat_index), - vector_scratch[c_dim_id], sum); + sum = sycl::mad(matrix_a_.template eval(mat_index), + vector_scratch[c_dim_id], sum); mat_index += lda; } @@ -252,19 +253,19 @@ PORTBLAS_INLINE extract_input_block(matrix_scratch, local_id, group_id, lda, c_tile_id); // Ensure memory synchronization within work group - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); index_t mat_index = local_id; #pragma unroll for (index_t c_dim_id = 0; c_dim_id < cl_elems; ++c_dim_id) { - sum = cl::sycl::mad(matrix_scratch[mat_index], *vector_scratch++, sum); + sum = sycl::mad(matrix_scratch[mat_index], *vector_scratch++, sum); mat_index += local_range + 1; // Adding one as bank offset } // Ensure memory synchronization within work group - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } const index_t out_index = nc_dim_index + (c_group_id * nc_dim); @@ -352,7 +353,7 @@ template PORTBLAS_INLINE void Gemv::bind(cl::sycl::handler &h) { + work_per_thread>::bind(sycl::handler &h) { lhs_.bind(h); matrix_a_.bind(h); vector_x_.bind(h); @@ -405,7 +406,7 @@ template PORTBLAS_INLINE bool GemvRow::valid_thread(cl::sycl::nd_item<1> ndItem) const { + vector_t>::valid_thread(sycl::nd_item<1> ndItem) const { return true; } @@ -433,9 +434,9 @@ GemvRow::eval( template PORTBLAS_INLINE typename GemvRow::value_t + matrix_t, vector_t>::value_t GemvRow::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { using index_t = typename GemvRow::index_t; index_t localid = ndItem.get_local_id(0); @@ -456,10 +457,10 @@ GemvRow::eval( ((dimC + (localSz * nWG_col_) - 1) / (localSz * nWG_col_)) * localSz; index_t frs_row = idWFR * rowSz; - index_t lst_row = cl::sycl::min(index_t(dimR), index_t(frs_row + rowSz)); + index_t lst_row = sycl::min(index_t(dimR), index_t(frs_row + rowSz)); index_t frs_col = idWFC * dimWFC + interLoop * localid; - index_t lst_col = cl::sycl::min(index_t(dimC), index_t(frs_col + dimWFC)); + index_t lst_col = sycl::min(index_t(dimC), index_t(frs_col + dimWFC)); index_t id_col_thr = idWFC * localSz + localid; @@ -519,17 +520,17 @@ GemvRow::eval( // If the row length isn't a multiple of localSz * interLoop // we need to go for fewer columns. Pick the min. auto lst_k_int = - cl::sycl::min(index_t(id_col + interLoop), index_t(lst_col)); + sycl::min(index_t(id_col + interLoop), index_t(lst_col)); // Handle lower diagonal etc for (index_t k_int = - ((Lower) ? id_col - : cl::sycl::max( - index_t(row + ((!Diag || Unit) ? 1 : 0)), - index_t(id_col))); - k_int < ((Upper) ? lst_k_int - : cl::sycl::min( - index_t(row + ((!Diag || Unit) ? 0 : 1)), - index_t(lst_k_int))); + ((Lower) + ? id_col + : sycl::max(index_t(row + ((!Diag || Unit) ? 1 : 0)), + index_t(id_col))); + k_int < + ((Upper) ? lst_k_int + : sycl::min(index_t(row + ((!Diag || Unit) ? 0 : 1)), + index_t(lst_k_int))); k_int++) { // calculate the product between the row and the vector_. auto prod = ProductOperator::eval(matrix_.eval(id_row, k_int), @@ -555,9 +556,9 @@ template template PORTBLAS_INLINE typename GemvRow::value_t + matrix_t, vector_t>::value_t GemvRow::eval( - local_memory_t shrMem, cl::sycl::nd_item<1> ndItem) { + local_memory_t shrMem, sycl::nd_item<1> ndItem) { using index_t = typename GemvRow::index_t; using value_t = typename GemvRow::eval( ((dimC + (localSz * nWG_col_) - 1) / (localSz * nWG_col_)) * localSz; index_t frs_row = idWFR * rowSz; - index_t lst_row = cl::sycl::min(index_t(dimR), index_t(frs_row + rowSz)); + index_t lst_row = sycl::min(index_t(dimR), index_t(frs_row + rowSz)); index_t frs_col = idWFC * dimWFC + interLoop * localid; - index_t lst_col = cl::sycl::min(index_t(dimC), index_t(frs_col + dimWFC)); + index_t lst_col = sycl::min(index_t(dimC), index_t(frs_col + dimWFC)); // TODO(Peter): This should be constexpr once half supports it static const value_t init_val = AddOperator::template init(); // PROBLEM IF ONLY SOME THREADS OF A WORKGROUP ARE CANCELED @@ -600,7 +601,7 @@ GemvRow::eval( } else { for (index_t rowid = frs_row; rowid < lst_row; rowid += shrSz) { value_t val = init_val; - auto blqSz = cl::sycl::min(index_t(shrSz), index_t(lst_row - rowid)); + auto blqSz = sycl::min(index_t(shrSz), index_t(lst_row - rowid)); if (interLoop == 1) { for (index_t row = 0, id_row = rowid; row < blqSz; row++, id_row++) { val = (Diag && Unit && @@ -631,7 +632,7 @@ GemvRow::eval( id_col += localSz * interLoop) { for (index_t k_int = id_col; k_int < - cl::sycl::min(index_t(id_col + interLoop), index_t(lst_col)); + sycl::min(index_t(id_col + interLoop), index_t(lst_col)); k_int++) { if (Lower && Upper && Diag && !Unit) { auto prod = ProductOperator::eval(matrix_.eval(id_row, k_int), @@ -657,7 +658,7 @@ GemvRow::eval( } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); // Reduction inside the block for (index_t offset = localSz >> 1; offset > 0; offset >>= 1) { if (localid < offset) { @@ -669,7 +670,7 @@ GemvRow::eval( } // This barrier is mandatory to be sure the data are on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } if (localid == 0) { for (index_t row = 0, id_row = rowid; row < blqSz; row++, id_row++) { @@ -684,7 +685,7 @@ GemvRow::eval( template PORTBLAS_INLINE void GemvRow::bind(cl::sycl::handler &h) { + matrix_t, vector_t>::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); @@ -734,7 +735,7 @@ template PORTBLAS_INLINE bool GemvCol::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } @@ -756,9 +757,9 @@ GemvCol::eval(index_t i) { template PORTBLAS_INLINE typename GemvCol::value_t + vector_t>::value_t GemvCol::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { using index_t = typename GemvCol::index_t; index_t localid = ndItem.get_local_id(0); @@ -775,10 +776,10 @@ GemvCol::eval( (dimR + (localSz * nWG_row_) - 1) / (localSz * nWG_row_) * localSz; index_t frs_row = idWFR * dimWFR + localid; - index_t lst_row = cl::sycl::min(index_t(dimR), index_t(frs_row + dimWFR)); + index_t lst_row = sycl::min(index_t(dimR), index_t(frs_row + dimWFR)); index_t frs_col = idWFC * colSz; - index_t lst_col = cl::sycl::min(index_t(dimC), index_t(frs_col + colSz)); + index_t lst_col = sycl::min(index_t(dimC), index_t(frs_col + colSz)); // PROBLEM IF ONLY SOME THREADS OF A WORKGROUP ARE CANCELED // TO SOLVE IT, USE GLOBAL VALUES OF frs_row AND lst_row if ((!Upper && @@ -796,14 +797,13 @@ GemvCol::eval( ? matrix_.eval(rowid, rowid) : AdditionIdentity::eval(vector_.eval(0)); for (index_t id_col = - ((Lower) - ? frs_col - : cl::sycl::max(index_t(rowid + ((!Diag || Unit) ? 1 : 0)), + ((Lower) ? frs_col + : sycl::max(index_t(rowid + ((!Diag || Unit) ? 1 : 0)), index_t(frs_col))); - id_col < - ((Upper) ? lst_col - : cl::sycl::min(index_t(rowid + ((!Diag || Unit) ? 0 : 1)), - index_t(lst_col))); + id_col < ((Upper) + ? lst_col + : sycl::min(index_t(rowid + ((!Diag || Unit) ? 0 : 1)), + index_t(lst_col))); id_col++) { auto prod = ProductOperator::eval(matrix_.eval(rowid, id_col), vector_.eval(id_col)); @@ -821,9 +821,9 @@ template template PORTBLAS_INLINE typename GemvCol::value_t + vector_t>::value_t GemvCol::eval( - local_memory_t shrMem, cl::sycl::nd_item<1> ndItem) { + local_memory_t shrMem, sycl::nd_item<1> ndItem) { using index_t = typename GemvCol::index_t; index_t localid = ndItem.get_local_id(0); @@ -840,10 +840,10 @@ GemvCol::eval( (dimR + (localSz * nWG_row_) - 1) / (localSz * nWG_row_) * localSz; index_t frs_row = idWFR * dimWFR + localid; - index_t lst_row = cl::sycl::min(index_t(dimR), index_t(frs_row + dimWFR)); + index_t lst_row = sycl::min(index_t(dimR), index_t(frs_row + dimWFR)); index_t frs_col = idWFC * colSz; - index_t lst_col = cl::sycl::min(index_t(dimC), index_t(frs_col + colSz)); + index_t lst_col = sycl::min(index_t(dimC), index_t(frs_col + colSz)); // PROBLEM IF ONLY SOME THREADS OF A WORKGROUP ARE CANCELED // TO SOLVE IT, USE GLOBAL VALUES OF frs_row AND lst_row @@ -861,17 +861,17 @@ GemvCol::eval( if (colid > frs_col) { // This barrier is mandatory to be sure the data is on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } auto blqSz = - cl::sycl::min(index_t(local_memory_size_), index_t(lst_col - colid)); + sycl::min(index_t(local_memory_size_), index_t(lst_col - colid)); // Copy a block of elements of vector_ vector_ to the shared memory, // executing the expresion tree if it is needed for (index_t col = localid; (col < blqSz); col += localSz) { shrMem[col] = vector_.eval(colid + col); } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); // The product is computed for (index_t rowid = frs_row; rowid < lst_row; rowid += localSz) { @@ -906,7 +906,7 @@ GemvCol::eval( template PORTBLAS_INLINE void GemvCol::bind(cl::sycl::handler &h) { + vector_t>::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); diff --git a/src/operations/blas2/ger.hpp b/src/operations/blas2/ger.hpp index 63e604746..08eff8b8f 100644 --- a/src/operations/blas2/ger.hpp +++ b/src/operations/blas2/ger.hpp @@ -54,13 +54,13 @@ Ger::get_size() const { } template PORTBLAS_INLINE bool Ger::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } template PORTBLAS_INLINE typename Ger::value_t -Ger::eval(cl::sycl::nd_item<1> ndItem) { +Ger::eval(sycl::nd_item<1> ndItem) { using index_t = typename Ger::index_t; const index_t subgroup_size = ndItem.get_sub_group().get_local_range().get(0); @@ -106,7 +106,7 @@ Ger::eval(cl::sycl::nd_item<1> ndItem) { for (index_t sub_id_col = 0; sub_id_col < col_per_workitem; sub_id_col++) { const value_t rhs_2_sub_id_col = #ifndef __ADAPTIVECPP__ - cl::sycl::group_broadcast(ndItem.get_sub_group(), rhs_2, sub_id_col); + sycl::group_broadcast(ndItem.get_sub_group(), rhs_2, sub_id_col); #else rhs_2_.eval(id_col0 + sub_id_col); #endif @@ -125,8 +125,7 @@ Ger::eval(cl::sycl::nd_item<1> ndItem) { template template PORTBLAS_INLINE typename Ger::value_t -Ger::eval(sharedT shrMem, - cl::sycl::nd_item<1> ndItem) { +Ger::eval(sharedT shrMem, sycl::nd_item<1> ndItem) { using index_t = typename Ger::index_t; const index_t group_id = ndItem.get_group(0); @@ -172,7 +171,7 @@ Ger::eval(sharedT shrMem, value_t prefetch_lhs_ = (id_row1 < dimR && id_col1 < dimC) ? lhs_.eval(id_row1, id_col1) : 0; - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); for (index_t id_col = 0; id_col < col_per_workitem; id_col++) { const value_t val = l_rhs_1[id_row0] * l_rhs_2[id_col0 + id_col]; @@ -188,7 +187,7 @@ Ger::eval(sharedT shrMem, } template -PORTBLAS_INLINE void Ger::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Ger::bind(sycl::handler &h) { lhs_.bind(h); rhs_1_.bind(h); rhs_2_.bind(h); @@ -229,7 +228,7 @@ template PORTBLAS_INLINE bool GerRow::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } @@ -253,9 +252,9 @@ GerRow::eval( template PORTBLAS_INLINE typename GerRow::value_t + rhs_2_t>::value_t GerRow::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { using index_t = typename GerRow::index_t; index_t localid = ndItem.get_local_id(0); @@ -326,9 +325,9 @@ template template PORTBLAS_INLINE typename GerRow::value_t + rhs_2_t>::value_t GerRow::eval( - sharedT shrMem, cl::sycl::nd_item<1> ndItem) { + sharedT shrMem, sycl::nd_item<1> ndItem) { using index_t = typename GerRow::index_t; index_t localid = ndItem.get_local_id(0); @@ -362,7 +361,7 @@ GerRow::eval( if (rowid > frs_row) // This barrier is mandatory to be sure the data is on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); auto blqSz = std::min(shrSz, lst_row - rowid); for (index_t row = localid, id_row = rowid + localid; (row < blqSz); row += localSz, id_row += localSz) { @@ -370,7 +369,7 @@ GerRow::eval( } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); for (index_t colid = frs_col; (colid < lst_col); colid += localSz) { auto val = rhs_2_.eval(colid); @@ -392,7 +391,7 @@ GerRow::eval( if (rowid > frs_row) // This barrier is mandatory to be sure the data is on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); auto blqSz = std::min(shrSz1, lst_row - rowid); for (index_t row = localid, id_row = rowid + localid; (row < blqSz); row += localSz, id_row += localSz) { @@ -401,7 +400,7 @@ GerRow::eval( } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); for (index_t colid = frs_col; (colid < lst_col); colid += localSz) { auto val1 = rhs_1_.eval(colid); @@ -427,7 +426,7 @@ GerRow::eval( template PORTBLAS_INLINE void GerRow::bind(cl::sycl::handler &h) { + rhs_2_t>::bind(sycl::handler &h) { lhs_.bind(h); rhs_1_.bind(h); rhs_2_.bind(h); @@ -469,7 +468,7 @@ template PORTBLAS_INLINE bool GerCol::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } template ::eval( template PORTBLAS_INLINE typename GerCol::value_t + rhs_2_t>::value_t GerCol::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { using index_t = typename GerCol::index_t; index_t localid = ndItem.get_local_id(0); @@ -554,9 +553,9 @@ template template PORTBLAS_INLINE typename GerCol::value_t + rhs_2_t>::value_t GerCol::eval( - sharedT shrMem, cl::sycl::nd_item<1> ndItem) { + sharedT shrMem, sycl::nd_item<1> ndItem) { using index_t = typename GerCol::index_t; index_t localid = ndItem.get_local_id(0); @@ -590,7 +589,7 @@ GerCol::eval( if (colid > frs_col) { // This barrier is mandatory to be sure the data is on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } auto blqSz = std::min(local_memory_size_, lst_col - colid); @@ -599,7 +598,7 @@ GerCol::eval( } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); for (index_t id_row = frs_row; id_row < lst_row; id_row += localSz) { auto val = rhs_1_.eval(id_row); @@ -622,7 +621,7 @@ GerCol::eval( if (colid > frs_col) { // This barrier is mandatory to be sure the data is on the shared // memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } auto blqSz = std::min(shrSz1, lst_col - colid); @@ -632,7 +631,7 @@ GerCol::eval( } // This barrier is mandatory to be sure the data is on the shared memory - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); for (index_t id_row = frs_row; id_row < lst_row; id_row += localSz) { auto val1 = rhs_1_.eval(id_row); @@ -658,7 +657,7 @@ GerCol::eval( template PORTBLAS_INLINE void GerCol::bind(cl::sycl::handler &h) { + rhs_2_t>::bind(sycl::handler &h) { lhs_.bind(h); rhs_1_.bind(h); rhs_2_.bind(h); diff --git a/src/operations/blas2/sbmv.hpp b/src/operations/blas2/sbmv.hpp index 53dc76a25..31e125839 100644 --- a/src/operations/blas2/sbmv.hpp +++ b/src/operations/blas2/sbmv.hpp @@ -66,7 +66,7 @@ template PORTBLAS_INLINE bool Sbmv::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { // Valid threads are established by ::eval. return true; } @@ -76,13 +76,13 @@ template ::value_t Sbmv::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t lhs_idx = ndItem.get_global_id(0); value_t val = 0; if (lhs_idx < lhs_.get_size()) { - const index_t k_beg = cl::sycl::max(index_t(0), lhs_idx - k_); - const index_t k_end = cl::sycl::min(vector_.get_size(), lhs_idx + k_ + 1); + const index_t k_beg = sycl::max(index_t(0), lhs_idx - k_); + const index_t k_end = sycl::min(vector_.get_size(), lhs_idx + k_ + 1); for (index_t s_idx = k_beg; s_idx < k_end; ++s_idx) { index_t K, J; @@ -108,8 +108,8 @@ PORTBLAS_INLINE template -PORTBLAS_INLINE void Sbmv::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void +Sbmv::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); diff --git a/src/operations/blas2/spr.hpp b/src/operations/blas2/spr.hpp index 9a69866d3..a94ecf4b6 100644 --- a/src/operations/blas2/spr.hpp +++ b/src/operations/blas2/spr.hpp @@ -92,7 +92,7 @@ Spr::compute_row_col( template typename rhs_1_t::value_t Spr::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t id = ndItem.get_local_linear_id(); const index_t group_id = ndItem.get_group(0); const index_t local_range = static_cast(ndItem.get_local_range(0)); @@ -109,8 +109,8 @@ typename rhs_1_t::value_t Spr::eval( #ifndef __ADAPTIVECPP__ } - row = cl::sycl::group_broadcast(ndItem.get_group(), row); - col = cl::sycl::group_broadcast(ndItem.get_group(), col); + row = sycl::group_broadcast(ndItem.get_group(), row); + col = sycl::group_broadcast(ndItem.get_group(), col); #endif if (global_idx < lhs_size) { @@ -151,7 +151,7 @@ typename rhs_1_t::value_t Spr::eval( template PORTBLAS_INLINE void Spr::bind( - cl::sycl::handler& h) { + sycl::handler& h) { lhs_.bind(h); rhs_1_.bind(h); rhs_2_.bind(h); @@ -174,9 +174,8 @@ Spr::get_size() const { } template -PORTBLAS_INLINE bool -Spr::valid_thread( - cl::sycl::nd_item<1> ndItem) const { +PORTBLAS_INLINE bool Spr::valid_thread(sycl::nd_item<1> ndItem) const { return true; } diff --git a/src/operations/blas2/tbmv.hpp b/src/operations/blas2/tbmv.hpp index f0b768e14..2eb0c33d0 100644 --- a/src/operations/blas2/tbmv.hpp +++ b/src/operations/blas2/tbmv.hpp @@ -63,7 +63,7 @@ template PORTBLAS_INLINE bool Tbmv::valid_thread(cl::sycl::nd_item<1> ndItem) const { + is_unitdiag>::valid_thread(sycl::nd_item<1> ndItem) const { // Valid threads are established by ::eval. return true; } @@ -72,9 +72,9 @@ template PORTBLAS_INLINE typename Tbmv::value_t + is_transposed, is_unitdiag>::value_t Tbmv::eval(cl::sycl::nd_item<1> ndItem) { + is_unitdiag>::eval(sycl::nd_item<1> ndItem) { const index_t lhs_idx = ndItem.get_global_id(0); value_t val = 0; @@ -86,9 +86,8 @@ Tbmv -PORTBLAS_INLINE void -Tbmv::bind(cl::sycl::handler &h) { +PORTBLAS_INLINE void Tbmv::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); diff --git a/src/operations/blas2/txsv.hpp b/src/operations/blas2/txsv.hpp index a974253f2..54fe0ef15 100644 --- a/src/operations/blas2/txsv.hpp +++ b/src/operations/blas2/txsv.hpp @@ -93,14 +93,14 @@ template template -PORTBLAS_INLINE typename Txsv::value_t -Txsv::eval(local_memory_t local_mem, - cl::sycl::nd_item<1> ndItem) { +PORTBLAS_INLINE + typename Txsv::value_t + Txsv::eval(local_memory_t local_mem, + sycl::nd_item<1> ndItem) { value_t ret = 0; -#if (SYCL_LANGUAGE_VERSION >= 202000) && !(defined __ADAPTIVECPP__) +#ifndef __ADAPTIVECPP__ constexpr bool is_forward = (is_upper && is_transposed) || (!is_upper && !is_transposed); @@ -137,9 +137,9 @@ Txsv( + auto a = sycl::atomic_ref( sync_.eval(0)); // Get the wg_id of actual workgroup @@ -185,7 +185,7 @@ Txsv ready_block)))) ready_block = - cl::sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); + sycl::group_broadcast(ndItem.get_sub_group(), not_wi0 ? 0 : *p); loc_x[l_idx] = (curr_offset < _N) ? lhs_.eval(curr_offset) : value_t(0); } @@ -220,7 +220,7 @@ Txsv PORTBLAS_INLINE void Txsv::bind(cl::sycl::handler &h) { + is_upper, is_transposed, is_unitdiag>::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); sync_.bind(h); diff --git a/src/operations/blas2/xpmv.hpp b/src/operations/blas2/xpmv.hpp index f30573aa1..b35b4d440 100644 --- a/src/operations/blas2/xpmv.hpp +++ b/src/operations/blas2/xpmv.hpp @@ -56,9 +56,9 @@ PORTBLAS_INLINE Xpmv -PORTBLAS_INLINE bool Xpmv::valid_thread(cl::sycl::nd_item<1> ndItem) +PORTBLAS_INLINE bool +Xpmv::valid_thread(sycl::nd_item<1> ndItem) const { // Valid threads are established by ::eval. return true; @@ -73,7 +73,7 @@ PORTBLAS_INLINE is_symmetric, is_upper, is_transposed, is_unit>::value_t Xpmv::eval(sharedT shrMem, - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t gid = ndItem.get_group(0); constexpr index_t loc_x_dim = local_range_x; @@ -136,13 +136,13 @@ PORTBLAS_INLINE A += _mat_next_stride(stride); } - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); #pragma unroll for (index_t _j = 0; _j < priv_y_dim; ++_j) priv_res += priv_A[_j] * loc_x[l_y_offset + _j]; - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); } } @@ -195,7 +195,7 @@ PORTBLAS_INLINE for (index_t b = (is_upper ? 0 : gid + 1); b < (is_upper ? gid : nblock); ++b) { - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); #pragma unroll for (index_t _j = 0; _j < priv_y_dim; ++_j) { @@ -215,7 +215,7 @@ PORTBLAS_INLINE A += _mat_next_stride(stride); } - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); if (!l_idy) { const index_t x_idx = b * loc_x_dim + l_idx; @@ -231,7 +231,7 @@ PORTBLAS_INLINE } } - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); #pragma unroll for (index_t _j = 0; _j < priv_y_dim; ++_j) { @@ -239,11 +239,11 @@ PORTBLAS_INLINE priv_res += loc_A[loc_lda * j + l_idx] * loc_x[j]; } - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); loc_A[loc_lda * l_idy + l_idx] = priv_res; - ndItem.barrier(cl::sycl::access::fence_space::local_space); + ndItem.barrier(sycl::access::fence_space::local_space); if (!l_idy) { value_t res = value_t(0); @@ -268,7 +268,7 @@ template PORTBLAS_INLINE void Xpmv::bind(cl::sycl::handler &h) { + is_upper, is_transposed, is_unit>::bind(sycl::handler &h) { lhs_.bind(h); matrix_.bind(h); vector_.bind(h); diff --git a/src/operations/blas3/gemm_common.hpp b/src/operations/blas3/gemm_common.hpp index 5c26e3984..522d2852b 100644 --- a/src/operations/blas3/gemm_common.hpp +++ b/src/operations/blas3/gemm_common.hpp @@ -27,8 +27,8 @@ #include "operations/blas3_trees.h" #include "views/view.h" -#include #include +#include #include namespace blas { @@ -47,7 +47,7 @@ static PORTBLAS_INLINE Tout mul_add(Tin a, Tin b, Tout c, typename std::enable_if::value && is_sycl_scalar::value>::type * = 0) { - return (cl::sycl::mad(a, b, c)); + return (sycl::mad(a, b, c)); } template diff --git a/src/operations/blas3/gemm_interleaved.hpp b/src/operations/blas3/gemm_interleaved.hpp index 0997b31ce..405e38c7f 100644 --- a/src/operations/blas3/gemm_interleaved.hpp +++ b/src/operations/blas3/gemm_interleaved.hpp @@ -33,7 +33,7 @@ namespace internal { template struct packet { - using type = cl::sycl::vec; + using type = sycl::vec; }; template @@ -41,7 +41,7 @@ struct packet { using type = T; }; -using address_t = cl::sycl::access::address_space; +using address_t = sycl::access::address_space; /*! * @brief Load a packet of size 1. @@ -64,8 +64,8 @@ PORTBLAS_INLINE void store(T packet, PtrT ptr) { */ template -PORTBLAS_INLINE void load(cl::sycl::vec &packet, PtrT ptr) { - packet.template load
(0, cl::sycl::multi_ptr(ptr)); +PORTBLAS_INLINE void load(sycl::vec &packet, PtrT ptr) { + packet.template load
(0, sycl::multi_ptr(ptr)); } /*! @@ -73,8 +73,8 @@ PORTBLAS_INLINE void load(cl::sycl::vec &packet, PtrT ptr) { */ template -PORTBLAS_INLINE void store(const cl::sycl::vec &packet, PtrT ptr) { - packet.template store
(0, cl::sycl::multi_ptr(ptr)); +PORTBLAS_INLINE void store(const sycl::vec &packet, PtrT ptr) { + packet.template store
(0, sycl::multi_ptr(ptr)); } } // namespace internal @@ -113,7 +113,7 @@ class Gemm::type; using index_t = typename std::make_signed::type; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; static constexpr int local_memory_size = 0; /*! @brief The number of rows processed by each work item */ static constexpr index_t item_rows = tile_type::item_rows; @@ -197,26 +197,26 @@ class Gemm get_nd_range(index_t) const noexcept { + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range(index_t) const noexcept { const index_t number_of_block_per_row = ((m_ - 1) / block_rows) + 1; const index_t number_of_block_per_cols = ((n_ - 1) / block_cols) + 1; const index_t number_of_block_per_batch = ((batch_size_ - 1) / (block_batchs * item_batchs)) + 1; - const cl::sycl::range<1> nwg(number_of_block_per_row * - number_of_block_per_cols * - number_of_block_per_batch); - const cl::sycl::range<1> wgs(wg_rows * wg_cols * wg_batchs); + const sycl::range<1> nwg(number_of_block_per_row * + number_of_block_per_cols * + number_of_block_per_batch); + const sycl::range<1> wgs(wg_rows * wg_cols * wg_batchs); - return cl::sycl::nd_range<1>(nwg * wgs, wgs); + return sycl::nd_range<1>(nwg * wgs, wgs); } - PORTBLAS_INLINE bool valid_thread(const cl::sycl::nd_item<1> &) const { + PORTBLAS_INLINE bool valid_thread(const sycl::nd_item<1> &) const { return true; } - PORTBLAS_INLINE void eval(cl::sycl::nd_item<1> id) noexcept { + PORTBLAS_INLINE void eval(sycl::nd_item<1> id) noexcept { auto A = a_.get_pointer(); auto B = b_.get_pointer(); auto C = c_.get_pointer(); @@ -501,9 +501,9 @@ class Gemm::value #endif // __ADAPTIVECPP__ ) { - *reg_res = cl::sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], - reg_b[i * (item_batchs / VectorSize) + b], - *reg_res); + *reg_res = + sycl::mad(reg_a[j * (item_batchs / VectorSize) + b], + reg_b[i * (item_batchs / VectorSize) + b], *reg_res); } else { #pragma unroll for (int v = 0; v < VectorSize; ++v) { @@ -523,7 +523,7 @@ class Gemm struct Packetize { #ifdef GEMM_VECTORIZATION_SUPPORT - using PacketType = cl::sycl::vec; + using PacketType = sycl::vec; static constexpr int packet_size = vector_size; template PORTBLAS_INLINE static constexpr bool check_size() { @@ -47,7 +47,7 @@ struct Packetize { } #else // In the case where vectorization is not enabled, always set to 1 - using PacketType = cl::sycl::vec; + using PacketType = sycl::vec; static constexpr int packet_size = 1; template PORTBLAS_INLINE static constexpr bool check_size() { @@ -86,9 +86,9 @@ struct Packetize { PacketType packet{}; if (in_range) { - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; packet.template load( - 0, cl::sycl::multi_ptr(src)); + 0, sycl::multi_ptr(src)); } else { #pragma unroll for (index_t i = 0; i < packet_size; i++) { @@ -119,9 +119,9 @@ struct Packetize { template static PORTBLAS_INLINE typename std::enable_if::type store( PacketType &packet, DestPointerType dest) { - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; packet.template store( - 0, cl::sycl::multi_ptr(dest)); + 0, sycl::multi_ptr(dest)); } }; diff --git a/src/operations/blas3/gemm_load_store_complex.hpp b/src/operations/blas3/gemm_load_store_complex.hpp index 7b1eb769b..8c1c96c98 100644 --- a/src/operations/blas3/gemm_load_store_complex.hpp +++ b/src/operations/blas3/gemm_load_store_complex.hpp @@ -39,8 +39,8 @@ template class vec_complex { static_assert(NumElements == 1, "Vector wrapper arround sycl::complex of size>1 unsupported."); - using address_t = cl::sycl::access::address_space; - using decorated_t = cl::sycl::access::decorated; + using address_t = sycl::access::address_space; + using decorated_t = sycl::access::decorated; using DataType = DataT; static constexpr int getNumElements() { return NumElements; } size_t size() const noexcept { return NumElements; } @@ -114,14 +114,14 @@ class vec_complex { // Load template void load(size_t Offset, - cl::sycl::multi_ptr Ptr) { + sycl::multi_ptr Ptr) { m_Data = *(Ptr + Offset * NumElements); } // Store template void store(size_t Offset, - cl::sycl::multi_ptr Ptr) const { + sycl::multi_ptr Ptr) const { *(Ptr + Offset * NumElements) = m_Data; } }; diff --git a/src/operations/blas3/gemm_load_store_joint_matrix.hpp b/src/operations/blas3/gemm_load_store_joint_matrix.hpp index 876817158..e9700b015 100644 --- a/src/operations/blas3/gemm_load_store_joint_matrix.hpp +++ b/src/operations/blas3/gemm_load_store_joint_matrix.hpp @@ -39,7 +39,7 @@ supported). template struct PacketizeJointMatrix { #ifdef GEMM_VECTORIZATION_SUPPORT - using PacketType = cl::sycl::vec; + using PacketType = sycl::vec; static constexpr int packet_size = vector_size; template PORTBLAS_INLINE static constexpr bool check_size() { @@ -47,7 +47,7 @@ struct PacketizeJointMatrix { } #else // In the case where vectorization is not enabled, always set to 1 - using PacketType = cl::sycl::vec; + using PacketType = sycl::vec; static constexpr int packet_size = 1; template PORTBLAS_INLINE static constexpr bool check_size() { @@ -67,20 +67,20 @@ struct PacketizeJointMatrix { const bool in_range, SrcPointerType src, DestPointerType dest, EdgePredicate) { value_t val = in_range ? *src : value_t{0}; - using address_t = cl::sycl::access::address_space; - if constexpr (std::is_same, - DestPointerType>::value) { - using dtype = cl::sycl::half; + using address_t = sycl::access::address_space; + if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { + using dtype = sycl::half; *dest = static_cast(val); - } else if constexpr (std::is_same, - DestPointerType>::value) { - using namespace cl::sycl::ext::oneapi; + } else if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { + using namespace sycl::ext::oneapi; *dest = bfloat16(val); } else { - using namespace cl::sycl::ext::oneapi::experimental::matrix; + using namespace sycl::ext::oneapi::experimental::matrix; *dest = round_to_tf32(val); } } @@ -99,10 +99,10 @@ struct PacketizeJointMatrix { EdgePredicate edge_in_range) { PacketType packet{}; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; if (in_range) { packet.template load( - 0, cl::sycl::multi_ptr(src)); + 0, sycl::multi_ptr(src)); store(packet, dest); } else { // avoid writing to variable, instead directly write to @@ -110,19 +110,19 @@ struct PacketizeJointMatrix { // with release compiler. #pragma unroll for (index_t i = 0; i < packet_size; i++, dest++, src++) { - if constexpr (std::is_same, - DestPointerType>::value) { - using dtype = cl::sycl::half; + if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { + using dtype = sycl::half; *dest = static_cast(edge_in_range(i) ? *src : 0); - } else if constexpr (std::is_same, - DestPointerType>::value) { - using namespace cl::sycl::ext::oneapi; + } else if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { + using namespace sycl::ext::oneapi; *dest = bfloat16(edge_in_range(i) ? *src : 0.f); } else { - using namespace cl::sycl::ext::oneapi::experimental::matrix; + using namespace sycl::ext::oneapi::experimental::matrix; *dest = edge_in_range(i) ? round_to_tf32(*src) : 0.f; } } @@ -134,39 +134,39 @@ struct PacketizeJointMatrix { */ template static PORTBLAS_INLINE void store(PacketType &packet, DestPointerType dest) { - using address_t = cl::sycl::access::address_space; - if constexpr (std::is_same, - DestPointerType>::value) { - using dtype = cl::sycl::half; - cl::sycl::vec new_vec{}; + using address_t = sycl::access::address_space; + if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { + using dtype = sycl::half; + sycl::vec new_vec{}; for (index_t i = 0; i < packet_size; i++) { reinterpret_cast(&new_vec)[i] = static_cast(reinterpret_cast(&packet)[i]); } new_vec.template store( - 0, cl::sycl::multi_ptr(dest)); - } else if constexpr (std::is_same, - DestPointerType>::value) { + 0, sycl::multi_ptr(dest)); + } else if constexpr (std::is_same< + sycl::multi_ptr, + DestPointerType>::value) { // sycl::vec doesn't accept bfloat16 as a valid input type // so we need to write the packet elements individually to // the shared memory. - using namespace cl::sycl::ext::oneapi; + using namespace sycl::ext::oneapi; for (index_t i = 0; i < packet_size; i++, dest++) { *dest = bfloat16(reinterpret_cast(&packet)[i]); } } else { - using namespace cl::sycl::ext::oneapi::experimental::matrix; + using namespace sycl::ext::oneapi::experimental::matrix; using dtype = float; - cl::sycl::vec new_vec; + sycl::vec new_vec; for (index_t i = 0; i < packet_size; i++) { reinterpret_cast(&new_vec)[i] = round_to_tf32(reinterpret_cast(&packet)[i]); } new_vec.template store( - 0, cl::sycl::multi_ptr(dest)); + 0, sycl::multi_ptr(dest)); } } }; diff --git a/src/operations/blas3/gemm_local.hpp b/src/operations/blas3/gemm_local.hpp index 776c1dc08..a69da1cda 100644 --- a/src/operations/blas3/gemm_local.hpp +++ b/src/operations/blas3/gemm_local.hpp @@ -89,7 +89,7 @@ class Gemm; using vector_t = typename packetize_t::PacketType; using vector_out_t = typename packetize_out_t::PacketType; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; // enable easier access to tile dimensions static constexpr index_t item_rows = tile_type::item_rows; @@ -234,11 +234,11 @@ class Gemm get_nd_range( + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range( index_t compute_units) const noexcept { - const cl::sycl::range<1> nwg(get_workgroup_cluster() * - get_num_workgroup_cluster(compute_units)); - const cl::sycl::range<1> wgs(wg_size); + const sycl::range<1> nwg(get_workgroup_cluster() * + get_num_workgroup_cluster(compute_units)); + const sycl::range<1> wgs(wg_size); #ifdef VERBOSE std::cout << " M: " << a_.get_size_row() << " , N " << b_.get_size_col() << " , big_tile_rows: " << big_tile_rows @@ -246,7 +246,7 @@ class Gemm(nwg * wgs, wgs); + return sycl::nd_range<1>(nwg * wgs, wgs); } PORTBLAS_INLINE index_t get_size() const { @@ -261,7 +261,7 @@ class Gemm PORTBLAS_INLINE void eval(local_memory_t scratch_acc, - const cl::sycl::nd_item<1> &id) noexcept { + const sycl::nd_item<1> &id) noexcept { index_t m = a_.get_size_row(); index_t n = b_.get_size_col(); const index_t k = a_.get_size_col(); @@ -354,7 +354,7 @@ class Gemm &ndItem) const { + PORTBLAS_INLINE bool valid_thread(const sycl::nd_item<1> &ndItem) const { return true; } @@ -425,17 +425,16 @@ class Gemm PORTBLAS_INLINE void compute_panel_gemm( - const cl::sycl::nd_item<1> &id, const index_t &item_id, - const index_t &row_a, const index_t &col_a, const index_t &row_b, - const index_t &col_b, const index_t &m, const index_t &n, - const index_t &orig_k, const index_t &mc, const index_t &nc, - const index_t &a_size, const index_t &b_size, const index_t &c_size, - InputPointerType orig_A, const index_t &lda, InputPointerType orig_B, - const index_t &ldb, OutputPointerType orig_C, const index_t &ldc, - ScratchPointerType s1, ScratchPointerType s2, ScratchPointerType s3, - ScratchPointerType s4, value_t *reg_a, value_t ®_b, - const bool out_of_range, index_t batch_stride, index_t wg_batch_id, - index_t batch_size) noexcept { + const sycl::nd_item<1> &id, const index_t &item_id, const index_t &row_a, + const index_t &col_a, const index_t &row_b, const index_t &col_b, + const index_t &m, const index_t &n, const index_t &orig_k, + const index_t &mc, const index_t &nc, const index_t &a_size, + const index_t &b_size, const index_t &c_size, InputPointerType orig_A, + const index_t &lda, InputPointerType orig_B, const index_t &ldb, + OutputPointerType orig_C, const index_t &ldc, ScratchPointerType s1, + ScratchPointerType s2, ScratchPointerType s3, ScratchPointerType s4, + value_t *reg_a, value_t ®_b, const bool out_of_range, + index_t batch_stride, index_t wg_batch_id, index_t batch_size) noexcept { index_t ofs = 1; do { auto A = orig_A; @@ -453,7 +452,7 @@ class Gemm(item_id, m, n, k, ra, ca, rb, cb, A, lda, B, ldb, s1, s3, out_of_range); - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); compute_block_gemm(item_id, s2, s4, reg_a, reg_b, reg_res); A += cl_elems * (trans_a ? 1 : lda); @@ -498,7 +497,7 @@ class Gemm(item_id, m, n, k, ra, ca, rb, cb, A, lda, B, ldb, s1, s3, out_of_range); - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); compute_block_gemm(item_id, s2, s4, reg_a, reg_b, reg_res); @@ -531,11 +530,11 @@ class Gemm( - 0, cl::sycl::multi_ptr(reg)); + 0, sycl::multi_ptr(reg)); out_vec *= alpha_; out_vec.template store( - 0, cl::sycl::multi_ptr(out_ptr)); + 0, sycl::multi_ptr(out_ptr)); } /*! * @brief Store the computed gemm result to the C matrix @@ -791,22 +790,21 @@ class Gemm static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &id, index_t &ofs_sign, P &s, - Ps &...ss) noexcept { + const sycl::nd_item<1> &id, index_t &ofs_sign, P &s, Ps &...ss) noexcept { s += ofs_sign * o; sync_smem(id, ofs_sign, ss...); } template static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &, index_t &ofs_sign) noexcept { + const sycl::nd_item<1> &, index_t &ofs_sign) noexcept { ofs_sign = -ofs_sign; } template static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &id, index_t &, Ps &...) noexcept { - id.barrier(cl::sycl::access::fence_space::local_space); + const sycl::nd_item<1> &id, index_t &, Ps &...) noexcept { + id.barrier(sycl::access::fence_space::local_space); } /** diff --git a/src/operations/blas3/gemm_local_joint_matrix.hpp b/src/operations/blas3/gemm_local_joint_matrix.hpp index 35e5ac434..267ad38b5 100644 --- a/src/operations/blas3/gemm_local_joint_matrix.hpp +++ b/src/operations/blas3/gemm_local_joint_matrix.hpp @@ -83,7 +83,7 @@ class Gemm::type; using index_t = typename std::make_signed::type; using packetize_t = PacketizeJointMatrix; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; // enable easier access to tile dimensions static constexpr index_t item_rows = tile_type::item_rows; @@ -244,7 +244,7 @@ class Gemm get_nd_range(index_t) const noexcept { + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range(index_t) const noexcept { size_t x_groups = static_cast((get_wg_x_cluster() - 1) / jm_row_frags + 1); size_t y_groups = @@ -256,8 +256,8 @@ class Gemm{x_groups * batch_size_ * y_groups * wg_size, - wg_size}; + return sycl::nd_range<1>{x_groups * batch_size_ * y_groups * wg_size, + wg_size}; } PORTBLAS_INLINE index_t get_size() const { @@ -272,7 +272,7 @@ class Gemm PORTBLAS_INLINE void eval(local_memory_t scratch_acc, - const cl::sycl::nd_item<1> &id) noexcept { + const sycl::nd_item<1> &id) noexcept { index_t m = a_.get_size_row(); index_t n = b_.get_size_col(); index_t k = a_.get_size_col(); @@ -354,7 +354,7 @@ class Gemm::value) { auto s1 = scratch + s1_offset; auto s2 = scratch + s2_offset; @@ -373,7 +373,7 @@ class Gemm *>(&scratch); auto s1 = input_scratch + s1_offset; @@ -394,7 +394,7 @@ class Gemm &ndItem) const { + PORTBLAS_INLINE bool valid_thread(const sycl::nd_item<1> &ndItem) const { return true; } @@ -426,7 +426,7 @@ class Gemm PORTBLAS_INLINE void compute_panel_gemm( - const cl::sycl::nd_item<1> &id, const index_t &item_id, const index_t &m, + const sycl::nd_item<1> &id, const index_t &item_id, const index_t &m, const index_t &n, const index_t &orig_k, const index_t &mc, const index_t &nc, InputPointerType orig_A, const index_t &lda, InputPointerType orig_B, const index_t &ldb, OutputPointerType orig_C, @@ -436,11 +436,10 @@ class Gemm; + using namespace sycl::ext::oneapi::experimental::matrix; + using CType = joint_matrix; do { auto A = orig_A; auto B = orig_B; @@ -450,7 +449,7 @@ class Gemm= cl_elems) { extract_input_blocks( item_id, m, n, k, A, lda, B, ldb, s1, s3, out_of_range); - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); compute_block_gemm(id, s2, s4, reg_res); A += cl_elems * (trans_a ? 1 : lda); B += cl_elems * (trans_b ? ldb : 1); @@ -466,7 +465,7 @@ class Gemm 0) { extract_input_blocks( item_id, m, n, k, A, lda, B, ldb, s1, s3, out_of_range); - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); compute_block_gemm(id, s2, s4, reg_res); sync_smem - PORTBLAS_INLINE void store_output_block(cl::sycl::nd_item<1> id, index_t mc, + PORTBLAS_INLINE void store_output_block(sycl::nd_item<1> id, index_t mc, index_t nc, OutputPointerType C, ScratchPointerType scratch, index_t ldc, @@ -518,9 +517,9 @@ class Gemm; Cfloat_Type float_out; @@ -568,12 +567,12 @@ class Gemm= block_rows && nc >= nc_conditional) { @@ -785,21 +784,19 @@ class Gemm PORTBLAS_INLINE void compute_block_gemm( - const cl::sycl::nd_item<1> &id, InputPointerType s2, InputPointerType s4, + const sycl::nd_item<1> &id, InputPointerType s2, InputPointerType s4, CType (®_res)[frags_per_sg]) noexcept { - using namespace cl::sycl::ext::oneapi::experimental::matrix; + using namespace sycl::ext::oneapi::experimental::matrix; constexpr layout pattern_a = trans_a ? layout::row_major : layout::col_major; constexpr layout pattern_b = trans_b ? layout::row_major : layout::col_major; - using AType = - joint_matrix; - using BType = - joint_matrix; + using AType = joint_matrix; + using BType = joint_matrix; const index_t strideA = ldsa; const index_t strideB = ldsb; @@ -850,22 +847,21 @@ class Gemm static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &id, index_t &ofs_sign, P &s, - Ps &...ss) noexcept { + const sycl::nd_item<1> &id, index_t &ofs_sign, P &s, Ps &...ss) noexcept { s += ofs_sign * o; sync_smem(id, ofs_sign, ss...); } template static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &, index_t &ofs_sign) noexcept { + const sycl::nd_item<1> &, index_t &ofs_sign) noexcept { ofs_sign = -ofs_sign; } template static PORTBLAS_INLINE typename std::enable_if::type sync_smem( - const cl::sycl::nd_item<1> &id, index_t &, Ps &...) noexcept { - id.barrier(cl::sycl::access::fence_space::local_space); + const sycl::nd_item<1> &id, index_t &, Ps &...) noexcept { + id.barrier(sycl::access::fence_space::local_space); } }; // Gemm diff --git a/src/operations/blas3/gemm_no_local_full_vec.hpp b/src/operations/blas3/gemm_no_local_full_vec.hpp index 24d9e50ac..b377b06a5 100644 --- a/src/operations/blas3/gemm_no_local_full_vec.hpp +++ b/src/operations/blas3/gemm_no_local_full_vec.hpp @@ -70,7 +70,7 @@ class Gemm::type; using index_t = typename std::make_signed::type; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; using packetize_t = Packetize; static constexpr int local_memory_size = 0; /*! @brief The number of rows processed by each work item */ @@ -172,24 +172,24 @@ class Gemm get_nd_range( + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range( index_t compute_units) const noexcept { - const cl::sycl::range<1> nwg(get_workgroup_cluster() * - get_num_workgroup_cluster(compute_units)); - const cl::sycl::range<1> wgs(wg_rows * wg_cols); + const sycl::range<1> nwg(get_workgroup_cluster() * + get_num_workgroup_cluster(compute_units)); + const sycl::range<1> wgs(wg_rows * wg_cols); - return cl::sycl::nd_range<1>(nwg * wgs, wgs); + return sycl::nd_range<1>(nwg * wgs, wgs); } PORTBLAS_INLINE index_t get_size() const { return a_.get_size_row() * b_.get_size_col(); } - PORTBLAS_INLINE bool valid_thread(const cl::sycl::nd_item<1> &) const { + PORTBLAS_INLINE bool valid_thread(const sycl::nd_item<1> &) const { return true; } - PORTBLAS_INLINE void eval(cl::sycl::nd_item<1> id) noexcept { + PORTBLAS_INLINE void eval(sycl::nd_item<1> id) noexcept { index_t m = a_.get_size_row(); index_t n = b_.get_size_col(); const index_t original_m = m; @@ -332,12 +332,12 @@ class Gemm( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( C + j * wg_rows * packet_size)); out_vec *= beta_; out_vec.template store( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( reg_res + i * item_rows + j * packet_size)); } } @@ -420,7 +420,7 @@ class Gemm( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( ptr + i * ld + j * ptr_next)); } else { // if not in range perform element-wise load checking boundaries at @@ -586,7 +586,7 @@ class Gemm( - 0, cl::sycl::multi_ptr(out_reg)); + 0, sycl::multi_ptr(out_reg)); } } } @@ -648,7 +648,7 @@ class Gemm( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( ptr + (i * next_element + j) * ld)); } else { @@ -725,7 +725,7 @@ class Gemm( - 0, cl::sycl::multi_ptr(ptr)); + 0, sycl::multi_ptr(ptr)); } else { // Otherwise perform an element-wise load, checking boundaries each load. #pragma unroll @@ -737,7 +737,7 @@ class Gemm( - 0, cl::sycl::multi_ptr(reg)); + 0, sycl::multi_ptr(reg)); } /*! @@ -789,7 +789,7 @@ class Gemm( - 0, cl::sycl::multi_ptr(ptr)); + 0, sycl::multi_ptr(ptr)); } else { // Otherwise perform an element-wise load, checking boundaries each load. #pragma unroll @@ -801,7 +801,7 @@ class Gemm( - 0, cl::sycl::multi_ptr(reg)); + 0, sycl::multi_ptr(reg)); } /*! * @brief The following function computes the partial GEMM for the input @@ -922,12 +922,12 @@ class Gemm( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( reg_res + i * item_rows + j * packet_size)); out_vec *= alpha_; out_vec.template store( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( C + j * wg_rows * packet_size)); } } diff --git a/src/operations/blas3/gemm_no_local_partial_vec.hpp b/src/operations/blas3/gemm_no_local_partial_vec.hpp index 1b7027c0a..1a99c7be4 100644 --- a/src/operations/blas3/gemm_no_local_partial_vec.hpp +++ b/src/operations/blas3/gemm_no_local_partial_vec.hpp @@ -70,7 +70,7 @@ class Gemm::type; using index_t = typename std::make_signed::type; - using address_t = cl::sycl::access::address_space; + using address_t = sycl::access::address_space; using packetize_t = Packetize; using vector_t = typename packetize_t::PacketType; static constexpr int local_memory_size = 0; @@ -169,24 +169,24 @@ class Gemm get_nd_range( + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range( index_t compute_units) const noexcept { - const cl::sycl::range<1> nwg(get_workgroup_cluster() * - get_num_workgroup_cluster(compute_units)); - const cl::sycl::range<1> wgs(wg_rows * wg_cols); + const sycl::range<1> nwg(get_workgroup_cluster() * + get_num_workgroup_cluster(compute_units)); + const sycl::range<1> wgs(wg_rows * wg_cols); - return cl::sycl::nd_range<1>(nwg * wgs, wgs); + return sycl::nd_range<1>(nwg * wgs, wgs); } PORTBLAS_INLINE index_t get_size() const { return a_.get_size_row() * b_.get_size_col(); } - PORTBLAS_INLINE bool valid_thread(const cl::sycl::nd_item<1> &) const { + PORTBLAS_INLINE bool valid_thread(const sycl::nd_item<1> &) const { return true; } - PORTBLAS_INLINE void eval(cl::sycl::nd_item<1> id) noexcept { + PORTBLAS_INLINE void eval(sycl::nd_item<1> id) noexcept { index_t m = a_.get_size_row(); index_t n = b_.get_size_col(); const index_t k = a_.get_size_col(); @@ -418,7 +418,7 @@ class Gemm( - 0, - cl::sycl::multi_ptr(ptr)); + 0, sycl::multi_ptr(ptr)); } in_vec.template store( - 0, cl::sycl::multi_ptr(reg)); + 0, sycl::multi_ptr(reg)); // Move pointers and update index for next load ptr += ld; @@ -519,11 +518,11 @@ class Gemm( - 0, cl::sycl::multi_ptr(reg)); + 0, sycl::multi_ptr(reg)); out_vec *= alpha_; out_vec.template store( - 0, cl::sycl::multi_ptr(out_ptr)); + 0, sycl::multi_ptr(out_ptr)); } /*! @@ -564,12 +563,12 @@ class Gemm( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( reg_res + i * item_rows + j * a_packet_size)); out_vec *= alpha_; out_vec.template store( - 0, cl::sycl::multi_ptr( + 0, sycl::multi_ptr( C + j * wg_rows * a_packet_size)); } } diff --git a/src/operations/blas3/gemm_partial_local.hpp b/src/operations/blas3/gemm_partial_local.hpp index c58bd821e..579c6842e 100644 --- a/src/operations/blas3/gemm_partial_local.hpp +++ b/src/operations/blas3/gemm_partial_local.hpp @@ -173,7 +173,7 @@ class GemmPartial get_nd_range( + PORTBLAS_INLINE sycl::nd_range<1> get_nd_range( index_t compute_units) noexcept { - const cl::sycl::range<1> nwg(get_workgroup_cluster(compute_units)); - const cl::sycl::range<1> wgs(local_thread_size); - return cl::sycl::nd_range<1>(nwg * wgs, wgs); + const sycl::range<1> nwg(get_workgroup_cluster(compute_units)); + const sycl::range<1> wgs(local_thread_size); + return sycl::nd_range<1>(nwg * wgs, wgs); } template PORTBLAS_INLINE void eval(local_memory_t scratch, - cl::sycl::nd_item<1> id) noexcept { + sycl::nd_item<1> id) noexcept { /* Pointers to the scratch memory (lhs and rhs) */ value_t* scratch_ptr = scratch.localAcc.get_pointer(); value_t* rhs_scratch_ptr = scratch_ptr + rhs_scratch_offset; @@ -272,7 +272,7 @@ class GemmPartial -PORTBLAS_INLINE cl::sycl::nd_range<1> +PORTBLAS_INLINE sycl::nd_range<1> Gemm::get_nd_range(index_t compute_units) const noexcept { - const cl::sycl::range<1> nwg( + const sycl::range<1> nwg( Gemm::get_num_workgroup_cluster(compute_units)); - const cl::sycl::range<1> wgs(wg_size); - return cl::sycl::nd_range<1>(nwg * wgs, wgs); + const sycl::range<1> wgs(wg_size); + return sycl::nd_range<1>(nwg * wgs, wgs); } template ::valid_thread(const cl::sycl::nd_item<1>& ndItem) const { + UseJointMatrix>::valid_thread(const sycl::nd_item<1>& ndItem) const { return true; } @@ -201,7 +201,7 @@ PORTBLAS_INLINE void Gemm::eval(cl::sycl::nd_item<1> id) noexcept { + UseJointMatrix>::eval(sycl::nd_item<1> id) noexcept { const index_t wg_batch_id = id.get_group(0) / get_workgroup_cluster(); // This will disable all workgroups that dont have any batch to work on if (wg_batch_id >= batch_size_) { @@ -237,7 +237,7 @@ Gemm 0) { - reg_res = cl::sycl::mad(A[0], B[0], reg_res); + reg_res = sycl::mad(A[0], B[0], reg_res); --k_; A = A + (trans_a ? 1 : lda_); B = B + (trans_b ? ldb_ : 1); @@ -268,7 +268,7 @@ PORTBLAS_INLINE void Gemm::bind(cl::sycl::handler& h) { + UseJointMatrix>::bind(sycl::handler& h) { a_.bind(h); b_.bind(h); c_.bind(h); diff --git a/src/operations/blas3/trsm.hpp b/src/operations/blas3/trsm.hpp index b637e216f..39fdcca19 100644 --- a/src/operations/blas3/trsm.hpp +++ b/src/operations/blas3/trsm.hpp @@ -26,7 +26,7 @@ #include "operations/blas3_trees.h" #include "views/view.h" -#include +#include namespace blas { @@ -41,15 +41,14 @@ template PORTBLAS_INLINE bool DiagonalBlocksInverter::valid_thread( - cl::sycl::nd_item<1> id) const { + sycl::nd_item<1> id) const { return true; } template -PORTBLAS_INLINE void -DiagonalBlocksInverter::bind( - cl::sycl::handler& cgh) { +PORTBLAS_INLINE void DiagonalBlocksInverter::bind(sycl::handler& cgh) { A_.bind(cgh); invA_.bind(cgh); } @@ -67,7 +66,7 @@ template PORTBLAS_INLINE void DiagonalBlocksInverter::eval( - local_memory_t localMem, cl::sycl::nd_item<1> item) noexcept { + local_memory_t localMem, sycl::nd_item<1> item) noexcept { auto A = A_.get_pointer(); auto invA = invA_.get_pointer(); value_t* local = localMem.localAcc.get_pointer(); @@ -103,13 +102,13 @@ DiagonalBlocksInverter::eval( ? (onUnitDiag ? value_t{1} : A[j * lda_ + i + srcBlockOffset]) : value_t{0}; } - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); // Inverts the diagonal elements if (!UnitDiag) { local[i + i * internalBlockSize] = value_t{1} / local[i + i * internalBlockSize]; - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); } if (Upper) { @@ -117,16 +116,16 @@ DiagonalBlocksInverter::eval( value_t sum = value_t{0}; if (i < j) { for (index_t k = 0; k < j; ++k) { - sum = cl::sycl::mad(local[k + i * internalBlockSize], - local[j + k * internalBlockSize], sum); + sum = sycl::mad(local[k + i * internalBlockSize], + local[j + k * internalBlockSize], sum); } } - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); if (i < j) { local[j + i * internalBlockSize] = sum * (UnitDiag ? -1 : -local[j + j * internalBlockSize]); } - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); } } else { // Computes the elements j+1:internalBlock-1 of the j-th column @@ -134,16 +133,16 @@ DiagonalBlocksInverter::eval( value_t sum = value_t{0}; if (i > j) { for (index_t k = j + 1; k < internalBlockSize; ++k) { - sum = cl::sycl::mad(local[k + i * internalBlockSize], - local[j + k * internalBlockSize], sum); + sum = sycl::mad(local[k + i * internalBlockSize], + local[j + k * internalBlockSize], sum); } } - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); if (i > j) { local[j + i * internalBlockSize] = sum * (UnitDiag ? -1 : -local[j + j * internalBlockSize]); } - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); } } diff --git a/src/operations/blas_operators.hpp b/src/operations/blas_operators.hpp index 86e6afc06..1e5569311 100644 --- a/src/operations/blas_operators.hpp +++ b/src/operations/blas_operators.hpp @@ -32,7 +32,7 @@ #include #include -#include +#include #include "operations/blas_constants.hpp" #include "operations/blas_operators.h" @@ -55,7 +55,7 @@ struct AbsoluteValue { using is_floating_point = std::integral_constant::value || - std::is_same::value>; + std::is_same::value>; #else template using is_floating_point = std::is_floating_point; @@ -65,14 +65,14 @@ struct AbsoluteValue { static PORTBLAS_INLINE value_t eval( const value_t &val, typename std::enable_if::value>::type * = 0) { - return cl::sycl::abs(val); + return sycl::abs(val); } template static PORTBLAS_INLINE value_t eval(const value_t &val, typename std::enable_if::value>::type * = 0) { - return cl::sycl::fabs(val); + return sycl::fabs(val); } }; @@ -103,7 +103,7 @@ struct IdentityOperator : public Operators { struct SignOperator : public Operators { template static PORTBLAS_INLINE rhs_t eval(const rhs_t r) { - return cl::sycl::sign(r); + return sycl::sign(r); } }; @@ -117,14 +117,14 @@ struct NegationOperator : public Operators { struct SqrtOperator : public Operators { template static PORTBLAS_INLINE rhs_t eval(const rhs_t r) { - return (cl::sycl::sqrt(r)); + return (sycl::sqrt(r)); } }; struct HypotenuseOperator : public Operators { template static PORTBLAS_INLINE rhs_t eval(const lhs_t l, const rhs_t r) { - return (cl::sycl::hypot(l, r)); + return (sycl::hypot(l, r)); } }; diff --git a/src/operations/extension/axpy_batch.hpp b/src/operations/extension/axpy_batch.hpp index 344d4ec2a..3a220ac38 100644 --- a/src/operations/extension/axpy_batch.hpp +++ b/src/operations/extension/axpy_batch.hpp @@ -53,7 +53,7 @@ template PORTBLAS_INLINE typename lhs_t::value_t Axpy_batch::eval( - cl::sycl::nd_item<1> ndItem) { + sycl::nd_item<1> ndItem) { const index_t n{n_}; const value_t alpha{alpha_}; const auto vx = rhs_.get_data(); @@ -68,7 +68,7 @@ Axpy_batch::eval( const index_t size_compute_rateo = (n > nbl * localSize) ? n / (nbl * localSize) : batch_size_; - const index_t jump_value{cl::sycl::min(batch_size_, size_compute_rateo)}; + const index_t jump_value{sycl::min(batch_size_, size_compute_rateo)}; if (group_id >= jump_value || l_id > n) return {}; @@ -109,7 +109,7 @@ Axpy_batch::eval( template PORTBLAS_INLINE void Axpy_batch::bind(cl::sycl::handler& h) { + rhs_t>::bind(sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } @@ -133,7 +133,7 @@ template PORTBLAS_INLINE bool Axpy_batch::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } } // namespace blas diff --git a/src/operations/extension/matcopy_batch.hpp b/src/operations/extension/matcopy_batch.hpp index 8ff3923d6..f410c1549 100644 --- a/src/operations/extension/matcopy_batch.hpp +++ b/src/operations/extension/matcopy_batch.hpp @@ -197,9 +197,8 @@ Matcopy_batch -typename lhs_t::value_t -Matcopy_batch::eval( - cl::sycl::nd_item<1> ndItem) { +typename lhs_t::value_t Matcopy_batch::eval(sycl::nd_item<1> ndItem) { const index_t m{m_}; const index_t n{n_}; @@ -239,7 +238,7 @@ Matcopy_batch::eval( template PORTBLAS_INLINE void Matcopy_batch::bind(cl::sycl::handler& h) { + rhs_2_t>::bind(sycl::handler& h) { lhs_.bind(h); rhs_1_.bind(h); rhs_2_.bind(h); @@ -265,7 +264,7 @@ template PORTBLAS_INLINE bool Matcopy_batch::valid_thread( - cl::sycl::nd_item<1> ndItem) const { + sycl::nd_item<1> ndItem) const { return true; } } // namespace blas diff --git a/src/operations/extension/reduction.hpp b/src/operations/extension/reduction.hpp index b807d2b04..419d303bf 100644 --- a/src/operations/extension/reduction.hpp +++ b/src/operations/extension/reduction.hpp @@ -28,8 +28,8 @@ #include "blas_meta.h" #include "operations/extension/reduction.h" #include "views/view.h" -#include #include +#include namespace blas { template PORTBLAS_INLINE bool Reduction::valid_thread( - cl::sycl::nd_item<1> id) const { + sycl::nd_item<1> id) const { return true; } template PORTBLAS_INLINE void Reduction::bind( - cl::sycl::handler& h) { + sycl::handler& h) { in_.bind(h); out_.bind(h); } @@ -92,7 +92,7 @@ PORTBLAS_INLINE void Reduction -PORTBLAS_INLINE cl::sycl::nd_range<1> +PORTBLAS_INLINE sycl::nd_range<1> Reduction::get_nd_range( index_t compute_units) noexcept { constexpr index_t local_range = params_t::get_local_thread_size_preserve() * @@ -106,8 +106,8 @@ Reduction::get_nd_range( const index_t global_range = preserve_num_groups * reduced_group_count_ * local_range; - return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), - cl::sycl::range<1>(local_range)); + return sycl::nd_range<1>(sycl::range<1>(global_range), + sycl::range<1>(local_range)); } /*! @@ -153,7 +153,7 @@ template template PORTBLAS_INLINE void Reduction::eval( - local_memory_t scratch, cl::sycl::nd_item<1> id) noexcept { + local_memory_t scratch, sycl::nd_item<1> id) noexcept { const index_t local_id = id.get_local_id(0); const index_t group_id = id.get_group(0); index_t preserve_local_id = @@ -201,7 +201,7 @@ PORTBLAS_INLINE void Reduction::eval( } element_t* out_scratch_ptr = scratch_ptr + scratch_idx; - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); if (!params_t::is_outer_dim()) { accumulator = *out_scratch_ptr; } @@ -220,7 +220,7 @@ PORTBLAS_INLINE void Reduction::eval( *out_scratch_ptr = accumulator; } - id.barrier(cl::sycl::access::fence_space::local_space); + id.barrier(sycl::access::fence_space::local_space); } // Write result to the output vector diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 87485660e..6e10bda18 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -34,7 +34,7 @@ template PORTBLAS_INLINE bool Transpose::valid_thread(cl::sycl::nd_item<1> item) const { + element_t>::valid_thread(sycl::nd_item<1> item) const { index_t idx = item.get_global_linear_id(); return (idx < get_size()); } @@ -43,7 +43,7 @@ template PORTBLAS_INLINE void Transpose::bind(cl::sycl::handler &cgh) { + element_t>::bind(sycl::handler &cgh) { A_.bind(cgh); At_.bind(cgh); } @@ -71,7 +71,7 @@ Transpose of the current work_item + * @param id [input] the sycl::nd_item<1> of the current work_item * @param in_idx [output] the input global-memory index * @param out_idx [output] the output global-memory index * @param i [output] the global row-index @@ -81,7 +81,7 @@ template PORTBLAS_INLINE void Transpose::get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, + element_t>::get_indices(sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx, index_t &i, index_t &j) { index_t idg = id.get_group(0); index_t idc = id.get_local_id(0); @@ -114,7 +114,7 @@ template PORTBLAS_INLINE void Transpose::eval(cl::sycl::nd_item<1> id) { + element_t>::eval(sycl::nd_item<1> id) { index_t idx = id.get_global_linear_id(); index_t in_index, out_index, i_id, j_id; @@ -137,7 +137,7 @@ Transpose of the current work_item + * @param id [input] the sycl::nd_item<1> of the current work_item * @param in_idx [output] the input global-memory index * @param out_idx [output] the output global-memory index * @param in_local_idx [output] the input local-memory linear index @@ -152,7 +152,7 @@ template PORTBLAS_INLINE void Transpose::get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, + element_t>::get_indices(sycl::nd_item<1> id, index_t &in_idx, index_t &in_local_idx, index_t &out_idx, index_t &out_local_idx, index_t &i_block_start, @@ -193,7 +193,7 @@ template PORTBLAS_INLINE void Transpose::eval(local_memory_t local_mem, cl::sycl::nd_item<1> id) { + element_t>::eval(local_memory_t local_mem, sycl::nd_item<1> id) { value_t *local = local_mem.localAcc.get_pointer(); auto A = A_.get_pointer(); auto At = At_.get_pointer(); @@ -212,7 +212,7 @@ Transpose PORTBLAS_INLINE bool TransposeAdd< both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, - element_t>::valid_thread(cl::sycl::nd_item<1> item) const { + element_t>::valid_thread(sycl::nd_item<1> item) const { auto idx = item.get_global_linear_id(); return idx < get_size(); } @@ -241,7 +241,7 @@ template PORTBLAS_INLINE void TransposeAdd::bind(cl::sycl::handler &cgh) { + in2_t, out_t, element_t>::bind(sycl::handler &cgh) { A_.bind(cgh); B_.bind(cgh); C_.bind(cgh); @@ -272,7 +272,7 @@ TransposeAdd of the current work_item + * @param id [input] the sycl::nd_item<1> of the current work_item * @param in_a_idx [output] the input A global-memory index * @param in_b_idx [output] the input B global-memory index * @param out_idx [output] the output C global-memory index @@ -286,7 +286,7 @@ template PORTBLAS_INLINE void TransposeAdd::get_indices(cl::sycl::nd_item<1> id, + in2_t, out_t, element_t>::get_indices(sycl::nd_item<1> id, index_t &in_a_idx, index_t &in_b_idx, index_t &out_idx, index_t &i, @@ -335,7 +335,7 @@ template PORTBLAS_INLINE void TransposeAdd::eval(cl::sycl::nd_item<1> id) { + in2_t, out_t, element_t>::eval(sycl::nd_item<1> id) { auto A = A_.get_pointer(); auto B = B_.get_pointer(); auto C = C_.get_pointer(); @@ -387,7 +387,7 @@ template PORTBLAS_INLINE void TransposeAdd< both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, - element_t>::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, + element_t>::get_indices(sycl::nd_item<1> id, index_t &in_a_idx, index_t &in_b_idx, index_t &in_local_idx, index_t &out_idx, index_t &out_local_idx, index_t &i_block_start, index_t &j_block_start, @@ -442,7 +442,7 @@ template PORTBLAS_INLINE void TransposeAdd::eval(local_memory_t local_mem, - cl::sycl::nd_item<1> id) { + sycl::nd_item<1> id) { value_t *local = local_mem.localAcc.get_pointer(); auto A = A_.get_pointer(); @@ -472,7 +472,7 @@ TransposeAdd #include +#include namespace blas { /*! @brief A struct for containing a local accessor if shared memory is enabled. @@ -44,8 +45,8 @@ struct LocalMemory { @param size Size in elements of the local accessor. @param cgh SYCL command group handler. */ - PORTBLAS_INLINE LocalMemory(size_t size, cl::sycl::handler &cgh) - : localAcc(cl::sycl::range<1>(size), cgh) {} + PORTBLAS_INLINE LocalMemory(size_t size, sycl::handler &cgh) + : localAcc(sycl::range<1>(size), cgh) {} /*! @brief Subscript operator that forwards on to the local accessor subscript @@ -53,16 +54,12 @@ struct LocalMemory { @param id SYCL id. @return Reference to an element of the local accessor. */ - PORTBLAS_INLINE value_t &operator[](cl::sycl::id<1> id) { - return localAcc[id]; - } + PORTBLAS_INLINE value_t &operator[](sycl::id<1> id) { return localAcc[id]; } /*! @brief Local accessor. */ - cl::sycl::accessor - localAcc; + sycl::local_accessor localAcc; }; /*! @@ -78,7 +75,7 @@ struct LocalMemory { @param size Size in elements of the local accessor. @param cgh SYCL command group handler. */ - PORTBLAS_INLINE LocalMemory(size_t, cl::sycl::handler &) {} + PORTBLAS_INLINE LocalMemory(size_t, sycl::handler &) {} }; /*! @@ -103,7 +100,7 @@ struct ExpressionTreeEvaluator { static PORTBLAS_INLINE void eval( expression_tree_t &tree, LocalMemory scratch, - cl::sycl::nd_item<1> index) { + sycl::nd_item<1> index) { tree.eval(scratch, index); } }; @@ -128,7 +125,7 @@ struct ExpressionTreeEvaluator, - cl::sycl::nd_item<1> index) { + sycl::nd_item<1> index) { if (tree.valid_thread(index)) { tree.eval(index); } @@ -156,7 +153,7 @@ index. static PORTBLAS_INLINE void eval( expression_tree_t &tree, LocalMemory scratch, - cl::sycl::nd_item<1> index) { + sycl::nd_item<1> index) { tree.eval(scratch, index); } }; @@ -178,7 +175,7 @@ struct ExpressionTreeFunctor { PORTBLAS_INLINE ExpressionTreeFunctor(local_memory_t scratch, expression_tree_t t) : scratch_(scratch), t_(t) {} - PORTBLAS_INLINE void operator()(cl::sycl::nd_item<1> i) const { + PORTBLAS_INLINE void operator()(sycl::nd_item<1> i) const { expression_tree_t &non_const_t = *const_cast(&t_); non_const_t.adjust_access_displacement(); ExpressionTreeEvaluator -static PORTBLAS_INLINE cl::sycl::event execute_tree( +static PORTBLAS_INLINE sycl::event execute_tree( queue_t q_, expression_tree_t t, size_t _localSize, size_t _globalSize, - size_t _shMem, std::vector dependencies) { + size_t _shMem, std::vector dependencies) { using value_t = typename LocalMemoryType::type; auto localSize = _localSize; auto globalSize = _globalSize; auto shMem = _shMem; - cl::sycl::event ev; + sycl::event ev; try { - auto cg1 = [=](cl::sycl::handler &h) mutable { -#if SYCL_LANGUAGE_VERSION < 202000 - cl::sycl::event::wait(dependencies); -#else + auto cg1 = [=](sycl::handler &h) mutable { h.depends_on(dependencies); -#endif t.bind(h); auto scratch = LocalMemory(shMem, h); - cl::sycl::nd_range<1> gridConfiguration = cl::sycl::nd_range<1>{ - cl::sycl::range<1>{globalSize}, cl::sycl::range<1>{localSize}}; + sycl::nd_range<1> gridConfiguration = sycl::nd_range<1>{ + sycl::range<1>{globalSize}, sycl::range<1>{localSize}}; h.parallel_for( gridConfiguration, ExpressionTreeFunctoracquire_usm_mem(size); else - return cl::sycl::malloc_device(size, q_); + return sycl::malloc_device(size, q_); } template @@ -91,10 +91,10 @@ SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies, if (tempMemPool_ != nullptr) return tempMemPool_->release_usm_mem(dependencies, mem); else { - cl::sycl::context context = q_.get_context(); - return {q_.submit([&](cl::sycl::handler& cgh) { + sycl::context context = q_.get_context(); + return {q_.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - cgh.host_task([=]() { cl::sycl::free(mem, context); }); + cgh.host_task([=]() { sycl::free(mem, context); }); })}; } } diff --git a/src/sb_handle/temp_memory_pool.hpp b/src/sb_handle/temp_memory_pool.hpp index 1d57b0c6f..a7fb75541 100644 --- a/src/sb_handle/temp_memory_pool.hpp +++ b/src/sb_handle/temp_memory_pool.hpp @@ -14,13 +14,13 @@ Temp_Mem_Pool::acquire_buff_mem(size_t size) { temp_buffer_map_mutex_.lock(); // lock auto found = temp_buffer_map_.lower_bound(byteSize); if (found != temp_buffer_map_.end()) { - cl::sycl::buffer buff = + sycl::buffer buff = found->second; temp_buffer_map_tot_byte_size_ -= found->first; temp_buffer_map_.erase(found); temp_buffer_map_mutex_.unlock(); // unlock return blas::BufferIterator{buff.reinterpret( - cl::sycl::range<1>(buff.byte_size() / sizeof(value_t)))}; + sycl::range<1>(buff.byte_size() / sizeof(value_t)))}; } else { temp_buffer_map_mutex_.unlock(); // unlock #ifdef VERBOSE @@ -37,7 +37,7 @@ void Temp_Mem_Pool::release_buff_mem_(const container_t& mem) { auto rebuff = mem.get_buffer() .template reinterpret( - cl::sycl::range<1>( + sycl::range<1>( byteSize / sizeof(temp_buffer_map_t::mapped_type::value_type))); temp_buffer_map_mutex_.lock(); // lock @@ -52,7 +52,7 @@ template typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_buff_mem( const typename Temp_Mem_Pool::event_t& dependencies, const container_t& mem) { - return {q_.submit([&](cl::sycl::handler& cgh) { + return {q_.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.host_task([&, mem]() { release_buff_mem_(mem); }); })}; @@ -77,7 +77,7 @@ Temp_Mem_Pool::acquire_usm_mem(size_t size) { std::cout << "Create a temporary USM allocation of " << byteSize << " bytes." << std::endl; #endif - value_t* tmp = cl::sycl::malloc_device(size, q_); + value_t* tmp = sycl::malloc_device(size, q_); temp_usm_map_mutex_.lock(); // lock temp_usm_size_map_.emplace( reinterpret_cast(tmp), byteSize); @@ -95,7 +95,7 @@ void Temp_Mem_Pool::release_usm_mem_(const container_t& mem) { if (temp_usm_map_tot_byte_size_ + byteSize > max_size_temp_mem_) { temp_usm_size_map_.erase(found); temp_usm_map_mutex_.unlock(); // unlock - cl::sycl::free(mem, q_); + sycl::free(mem, q_); } else { temp_usm_map_tot_byte_size_ += byteSize; temp_usm_map_.emplace(byteSize, @@ -108,7 +108,7 @@ template typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem( const typename Temp_Mem_Pool::event_t& dependencies, const container_t& mem) { - return {q_.submit([&](cl::sycl::handler& cgh) { + return {q_.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.host_task([&, mem]() { release_usm_mem_(mem); }); })}; diff --git a/src/views/view_sycl.hpp b/src/views/view_sycl.hpp index fa5ef0197..5b52ecf2a 100644 --- a/src/views/view_sycl.hpp +++ b/src/views/view_sycl.hpp @@ -26,7 +26,7 @@ #ifndef PORTBLAS_VIEW_SYCL_HPP #define PORTBLAS_VIEW_SYCL_HPP -#include +#include #include #include "blas_meta.h" @@ -40,20 +40,19 @@ namespace blas { * @tparam scalar_t Value type of accessor. */ -template +template struct VectorView< - cl::sycl::accessor, + sycl::accessor, view_index_t, view_increment_t> { using scalar_t = ViewScalarT; using value_t = scalar_t; using index_t = view_index_t; using increment_t = view_increment_t; - static constexpr cl::sycl::access::mode access_mode_t = acc_mode_t; - using container_t = cl::sycl::accessor; + static constexpr sycl::access_mode access_mode_t = acc_mode_t; + using container_t = + sycl::accessor; using self_t = VectorView; // Accessor to the data containing the vector values. @@ -73,7 +72,7 @@ struct VectorView< const increment_t stride_; // global pointer access inside the kernel - cl::sycl::global_ptr ptr_; + sycl::global_ptr ptr_; // Round up the ration num / den, i.e. compute ceil(num / den) static PORTBLAS_INLINE index_t round_up_ratio(index_t num, index_t den) { @@ -86,7 +85,7 @@ struct VectorView< static PORTBLAS_INLINE index_t calculate_input_data_size( container_t &data, index_t, increment_t stride, index_t size) noexcept { increment_t const positive_stride = stride < 0 ? -stride : stride; - index_t const calc_size = round_up_ratio(data.get_count(), positive_stride); + index_t const calc_size = round_up_ratio(data.size(), positive_stride); return std::min(size, calc_size); } @@ -157,11 +156,11 @@ struct VectorView< return (stride_ == 1) ? *(ptr_ + i) : *(ptr_ + i * stride_); } - PORTBLAS_INLINE scalar_t &eval(cl::sycl::nd_item<1> ndItem) { + PORTBLAS_INLINE scalar_t &eval(sycl::nd_item<1> ndItem) { return eval(ndItem.get_global_id(0)); } - PORTBLAS_INLINE scalar_t eval(cl::sycl::nd_item<1> ndItem) const { + PORTBLAS_INLINE scalar_t eval(sycl::nd_item<1> ndItem) const { return eval(ndItem.get_global_id(0)); } @@ -177,35 +176,33 @@ struct VectorView< return *(ptr_ + indx); } - PORTBLAS_INLINE void bind(cl::sycl::handler &h) { h.require(data_); } + PORTBLAS_INLINE void bind(sycl::handler &h) { h.require(data_); } PORTBLAS_INLINE void adjust_access_displacement() { ptr_ = data_.get_pointer() + disp_; } }; -template +template struct MatrixView< - cl::sycl::accessor, + sycl::accessor, view_index_t, layout, has_inc>; /*! * @brief Specialization of an MatrixView with an accessor. */ -template +template struct MatrixView< - cl::sycl::accessor, + sycl::accessor, view_index_t, layout, has_inc> { using access_layout_t = layout; using scalar_t = ViewScalarT; using index_t = view_index_t; - static constexpr cl::sycl::access::mode access_mode_t = acc_mode_t; - using container_t = cl::sycl::accessor; + static constexpr sycl::access_mode access_mode_t = acc_mode_t; + using container_t = + sycl::accessor; using self_t = MatrixView; using value_t = scalar_t; @@ -217,8 +214,7 @@ struct MatrixView< const index_t sizeL_; // size of the leading dimension const index_t inc_; // internal increment between same row/column elements const index_t disp_; // displacementt od the first element - cl::sycl::global_ptr - ptr_; // global pointer access inside the kernel + sycl::global_ptr ptr_; // global pointer access inside the kernel /**** CONSTRUCTORS ****/ PORTBLAS_INLINE MatrixView(container_t data, index_t sizeR, index_t sizeC, @@ -320,11 +316,11 @@ struct MatrixView< return eval(i, j); } - PORTBLAS_INLINE scalar_t &eval(cl::sycl::nd_item<1> ndItem) { + PORTBLAS_INLINE scalar_t &eval(sycl::nd_item<1> ndItem) { return eval(ndItem.get_global_id(0)); } - PORTBLAS_INLINE scalar_t eval(cl::sycl::nd_item<1> ndItem) const noexcept { + PORTBLAS_INLINE scalar_t eval(sycl::nd_item<1> ndItem) const noexcept { return eval(ndItem.get_global_id(0)); } @@ -340,7 +336,7 @@ struct MatrixView< return *(ptr_ + indx); } - PORTBLAS_INLINE void bind(cl::sycl::handler &h) { h.require(data_); } + PORTBLAS_INLINE void bind(sycl::handler &h) { h.require(data_); } PORTBLAS_INLINE void adjust_access_displacement() { ptr_ = data_.get_pointer() + disp_; diff --git a/test/blas_test.hpp b/test/blas_test.hpp index 0eee0fc17..42a845451 100644 --- a/test/blas_test.hpp +++ b/test/blas_test.hpp @@ -63,12 +63,12 @@ using index_t = BLAS_INDEX_T; * Construct a SYCL queue using the device specified in the command line, or * using the default device if not specified. */ -inline cl::sycl::queue make_queue_impl() { - auto async_handler = [=](cl::sycl::exception_list eL) { +inline sycl::queue make_queue_impl() { + auto async_handler = [=](sycl::exception_list eL) { for (auto &e : eL) { try { std::rethrow_exception(e); - } catch (cl::sycl::exception &e) { + } catch (sycl::exception &e) { std::cout << "Sycl Exception " << e.what() << std::endl; } catch (std::exception &e) { std::cout << "Standard Exception " << e.what() << std::endl; @@ -78,25 +78,13 @@ inline cl::sycl::queue make_queue_impl() { } }; -#if SYCL_LANGUAGE_VERSION >= 202002 - std::function selector; + std::function selector; if (args.device.empty()) { - selector = cl::sycl::default_selector_v; + selector = sycl::default_selector_v; } else { selector = utils::cli_device_selector(args.device); } - auto q = cl::sycl::queue(selector, async_handler); -#else - std::unique_ptr selector; - if (args.device.empty()) { - selector = std::unique_ptr( - new cl::sycl::default_selector()); - } else { - selector = std::unique_ptr( - new utils::cli_device_selector(args.device)); - } - auto q = cl::sycl::queue(*selector, async_handler); -#endif // HAS_SYCL2020_SELECTORS + auto q = sycl::queue(selector, async_handler); utils::print_queue_information(q); return q; @@ -105,9 +93,9 @@ inline cl::sycl::queue make_queue_impl() { /** * Get a SYCL queue to use in tests. */ -inline cl::sycl::queue make_queue() { +inline sycl::queue make_queue() { // Provide cached SYCL queue, to avoid recompiling kernels for each test case. - static cl::sycl::queue queue = make_queue_impl(); + static sycl::queue queue = make_queue_impl(); return queue; } @@ -121,8 +109,7 @@ static inline scalar_t random_scalar(scalar_t rangeMin, scalar_t rangeMax) { static std::random_device rd; static std::default_random_engine gen(rd()); using random_scalar_t = - std::conditional_t, float, - scalar_t>; + std::conditional_t, float, scalar_t>; std::uniform_real_distribution dis(rangeMin, rangeMax); return dis(gen); } @@ -274,7 +261,7 @@ struct dump_arg_helper { }; /** Specialization of dump_arg_helper for float and double. NB this is not a - * specialization for half. std::is_floating_point::value will + * specialization for half. std::is_floating_point::value will * return false. * * @tparam StdFloat A standard floating point type. @@ -293,7 +280,7 @@ struct dump_arg_helper< * @param f Floating point number to format */ inline void operator()(std::ostream &ss, StdFloat f) { - static_assert(!std::is_same::value, + static_assert(!std::is_same::value, "std library functions will not work with half."); if (std::isnan(f)) { ss << "nan"; @@ -313,12 +300,12 @@ struct dump_arg_helper< } }; -/** Specialization of dump_arg_helper for cl::sycl::half. +/** Specialization of dump_arg_helper for sycl::half. * This is required since half will not work with standard library functions. **/ template <> -struct dump_arg_helper { - inline void operator()(std::ostream &ss, cl::sycl::half f) { +struct dump_arg_helper { + inline void operator()(std::ostream &ss, sycl::half f) { dump_arg_helper{}(ss, f); } }; diff --git a/test/blas_test_macros.hpp b/test/blas_test_macros.hpp index 3147b610d..c449deed0 100644 --- a/test/blas_test_macros.hpp +++ b/test/blas_test_macros.hpp @@ -33,11 +33,11 @@ #endif /* ifdef VERBOSE */ #ifndef SYCL_DEVICE -#define SYCL_DEVICE_SELECTOR cl::sycl::default_selector +#define SYCL_DEVICE_SELECTOR sycl::default_selector #else #define PASTER(x, y) x##y #define EVALUATOR(x, y) PASTER(x, y) -#define SYCL_DEVICE_SELECTOR cl::sycl::EVALUATOR(SYCL_DEVICE, _selector) +#define SYCL_DEVICE_SELECTOR sycl::EVALUATOR(SYCL_DEVICE, _selector) #undef PASTER #undef EVALUATOR #endif /* ifndef SYCL_DEVICE */ @@ -73,33 +73,31 @@ #endif // BLAS_DATA_TYPE_DOUBLE #ifdef BLAS_ENABLE_HALF -/** Registers test for the cl::sycl::half type +/** Registers test for the sycl::half type * @see BLAS_REGISTER_TEST_CUSTOM_NAME */ -#define BLAS_REGISTER_TEST_HALF_CUSTOM_NAME(test_suite, class_name, \ - test_function, combination_t, \ - combination, name_generator) \ - class class_name##Half \ - : public ::testing::TestWithParam> {}; \ - TEST_P(class_name##Half, test) { \ - test_function(GetParam()); \ - }; \ - INSTANTIATE_TEST_SUITE_P(test_suite, class_name##Half, \ - combination, \ - name_generator); - -/** Registers test for the cl::sycl::half input type & float output type +#define BLAS_REGISTER_TEST_HALF_CUSTOM_NAME(test_suite, class_name, \ + test_function, combination_t, \ + combination, name_generator) \ + class class_name##Half \ + : public ::testing::TestWithParam> {}; \ + TEST_P(class_name##Half, test) { test_function(GetParam()); }; \ + INSTANTIATE_TEST_SUITE_P(test_suite, class_name##Half, \ + combination, \ + name_generator); + +/** Registers test for the sycl::half input type & float output type * @see BLAS_REGISTER_GEMM_TEST_CUSTOM_NAME */ #define BLAS_REGISTER_TEST_HALF_FLOAT_CUSTOM_NAME( \ test_suite, class_name, test_function, combination_t, combination, \ name_generator) \ class class_name##Half \ - : public ::testing::TestWithParam> {}; \ + : public ::testing::TestWithParam> {}; \ TEST_P(class_name##Half, test) { test_function(GetParam()); }; \ INSTANTIATE_TEST_SUITE_P(test_suite, class_name##Half, \ - combination, \ - name_generator); \ + combination, \ + name_generator); \ \ class class_name##Float \ : public ::testing::TestWithParam> {}; \ @@ -179,18 +177,18 @@ * @param combination Combinations object * @param name_generator Function used to generate test names */ -#define BLAS_REGISTER_GEMM_TEST_CUSTOM_NAME(test_suite, class_name, \ - test_function, combination_t, \ - combination, name_generator) \ - BLAS_REGISTER_TEST_FLOAT_CUSTOM_NAME(test_suite, class_name##Float, \ - test_function, combination_t, \ - combination, name_generator); \ - BLAS_REGISTER_TEST_DOUBLE_CUSTOM_NAME(test_suite, class_name##Double, \ - test_function, combination_t, \ - combination, name_generator); \ - BLAS_REGISTER_TEST_HALF_FLOAT_CUSTOM_NAME( \ - test_suite, class_name##Half, test_function, \ - combination_t, combination, name_generator); +#define BLAS_REGISTER_GEMM_TEST_CUSTOM_NAME(test_suite, class_name, \ + test_function, combination_t, \ + combination, name_generator) \ + BLAS_REGISTER_TEST_FLOAT_CUSTOM_NAME(test_suite, class_name##Float, \ + test_function, combination_t, \ + combination, name_generator); \ + BLAS_REGISTER_TEST_DOUBLE_CUSTOM_NAME(test_suite, class_name##Double, \ + test_function, combination_t, \ + combination, name_generator); \ + BLAS_REGISTER_TEST_HALF_FLOAT_CUSTOM_NAME( \ + test_suite, class_name##Half, test_function, combination_t, \ + combination, name_generator); #ifdef BLAS_ENABLE_COMPLEX #define BLAS_REGISTER_CPLX_TEST_CUSTOM_NAME(test_suite, class_name, \ diff --git a/test/unittest/blas1/blas1_asum_test.cpp b/test/unittest/blas1/blas1_asum_test.cpp index 6442794d5..252822f24 100644 --- a/test/unittest/blas1/blas1_asum_test.cpp +++ b/test/unittest/blas1/blas1_asum_test.cpp @@ -43,7 +43,7 @@ void run_test(const combination_t combi) { std::vector x_v(vector_size); fill_random(x_v); - // We need to guarantee that cl::sycl::half can hold the sum + // We need to guarantee that sycl::half can hold the sum // of x_v without overflow by making sum(x_v) to be 1.0 std::transform(std::begin(x_v), std::end(x_v), std::begin(x_v), [=](scalar_t x) { return x / x_v.size(); }); diff --git a/test/unittest/blas1/blas1_axpy_test.cpp b/test/unittest/blas1/blas1_axpy_test.cpp index 2759cc6c3..ac8c06dfa 100644 --- a/test/unittest/blas1/blas1_axpy_test.cpp +++ b/test/unittest/blas1/blas1_axpy_test.cpp @@ -49,8 +49,8 @@ void run_test(const combination_t combi) { auto q = make_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { GTEST_SKIP() << "Unsupported fp16 (half) on this device."; } diff --git a/test/unittest/blas1/blas1_scal_test.cpp b/test/unittest/blas1/blas1_scal_test.cpp index bf8c07240..a4ad5a011 100644 --- a/test/unittest/blas1/blas1_scal_test.cpp +++ b/test/unittest/blas1/blas1_scal_test.cpp @@ -42,8 +42,8 @@ void run_test(const combination_t combi) { auto q = make_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { GTEST_SKIP() << "Unsupported fp16 (half) on this device."; } diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp index b0c635918..a8f35f653 100644 --- a/test/unittest/blas3/blas3_gemm_common.hpp +++ b/test/unittest/blas3/blas3_gemm_common.hpp @@ -118,8 +118,8 @@ inline void verify_gemm(const gemm_arguments_t arguments) { auto q = make_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { GTEST_SKIP() << "Unsupported fp16 (half) on this device."; } blas::SB_Handle sb_handle(q); @@ -314,8 +314,8 @@ inline void verify_gemm( auto q = make_queue(); - if (std::is_same_v && - !q.get_device().has(cl::sycl::aspect::fp16)) { + if (std::is_same_v && + !q.get_device().has(sycl::aspect::fp16)) { GTEST_SKIP() << "Unsupported fp16 (half) on this device."; } diff --git a/test/unittest/extension/reduction_test.cpp b/test/unittest/extension/reduction_test.cpp index 7dc84d5c1..03c2a0881 100644 --- a/test/unittest/extension/reduction_test.cpp +++ b/test/unittest/extension/reduction_test.cpp @@ -223,7 +223,7 @@ void run_test(const combination_t combi) { {copy_m, copy_v}); break; } - } catch (cl::sycl::exception& e) { + } catch (sycl::exception& e) { std::cerr << "Exception occured:" << std::endl; std::cerr << e.what() << std::endl; } diff --git a/test/unittest/joint_matrix/joint_matrix_common.hpp b/test/unittest/joint_matrix/joint_matrix_common.hpp index c18366a83..ccf9def8d 100644 --- a/test/unittest/joint_matrix/joint_matrix_common.hpp +++ b/test/unittest/joint_matrix/joint_matrix_common.hpp @@ -125,40 +125,37 @@ inline void verify_gemm(const joint_matrix_arguments_t arguments) { typename blas::SB_Handle::event_t gemm_event; if (jm_inType == "half" && jm_outType == "float") { if (jm_m == 16 && jm_n == 16) { - gemm_event = launch_gemm_with_beta<16, 16, 16, cl::sycl::half, float>( + gemm_event = launch_gemm_with_beta<16, 16, 16, sycl::half, float>( sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } else if (jm_m == 32 && jm_n == 8) { - gemm_event = launch_gemm_with_beta<32, 8, 16, cl::sycl::half, float>( + gemm_event = launch_gemm_with_beta<32, 8, 16, sycl::half, float>( sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } else if (jm_n == 32 && jm_m == 8) { - gemm_event = launch_gemm_with_beta<8, 32, 16, cl::sycl::half, float>( + gemm_event = launch_gemm_with_beta<8, 32, 16, sycl::half, float>( sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } } else if (jm_inType == "half" && jm_outType == "half") { if (jm_m == 16 && jm_n == 16) { - gemm_event = - launch_gemm_with_beta<16, 16, 16, cl::sycl::half, cl::sycl::half>( - sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, - size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, - ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); + gemm_event = launch_gemm_with_beta<16, 16, 16, sycl::half, sycl::half>( + sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, + size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, + size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } else if (jm_m == 32 && jm_n == 8) { - gemm_event = - launch_gemm_with_beta<32, 8, 16, cl::sycl::half, cl::sycl::half>( - sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, - size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, - ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); + gemm_event = launch_gemm_with_beta<32, 8, 16, sycl::half, sycl::half>( + sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, + size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, + size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } else if (jm_n == 32 && jm_m == 8) { - gemm_event = - launch_gemm_with_beta<8, 32, 16, cl::sycl::half, cl::sycl::half>( - sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, - size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, - ldc, size_c, batch, batch_type, {copy_a, copy_b, copy_c}); + gemm_event = launch_gemm_with_beta<8, 32, 16, sycl::half, sycl::half>( + sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, + size_a, m_b_gpu + offset, ldb, size_b, beta, m_c_gpu + offset, ldc, + size_c, batch, batch_type, {copy_a, copy_b, copy_c}); } } else if (jm_inType == "bfloat16" && jm_outType == "float") { if (jm_m == 16 && jm_n == 16) { diff --git a/tools/auto_tuner/CMakeLists.txt b/tools/auto_tuner/CMakeLists.txt index bd39ad9f3..39f3d6d34 100644 --- a/tools/auto_tuner/CMakeLists.txt +++ b/tools/auto_tuner/CMakeLists.txt @@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/ ${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/Modules ) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable") # Setup datatypes, workgroup sizes and other options. diff --git a/tools/auto_tuner/include/gemm_tuner.hpp b/tools/auto_tuner/include/gemm_tuner.hpp index e3ad3e391..2803e1aac 100644 --- a/tools/auto_tuner/include/gemm_tuner.hpp +++ b/tools/auto_tuner/include/gemm_tuner.hpp @@ -30,7 +30,7 @@ #include "reference_gemm.hpp" #include "portblas.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace blas; // Convert batch_type=strided to interleaved on the host template diff --git a/tools/auto_tuner/include/utils.hpp b/tools/auto_tuner/include/utils.hpp index 4bcbebc04..e0fe39379 100644 --- a/tools/auto_tuner/include/utils.hpp +++ b/tools/auto_tuner/include/utils.hpp @@ -34,21 +34,20 @@ #include #include -inline cl::sycl::queue make_sycl_queue() { - cl::sycl::queue q( - [=](cl::sycl::exception_list ex_list) { +inline sycl::queue make_sycl_queue() { + sycl::queue q( + [=](sycl::exception_list ex_list) { try { for (auto &e_ptr : ex_list) { std::rethrow_exception(e_ptr); } - } catch (cl::sycl::exception &e) { + } catch (sycl::exception &e) { throw std::runtime_error(e.what()); } }, - {cl::sycl::property::queue::in_order()}); + {sycl::property::queue::in_order()}); std::cout << "\nDevice: " - << q.get_device().get_info() - << std::endl; + << q.get_device().get_info() << std::endl; return q; }