Refactor iamax and iamin api to match oneapi spec (#482)

codeplaysoftware · Dec 15, 2023 · 101c87a · 101c87a
1 parent 4221640
commit 101c87a
Show file tree

Hide file tree

Showing 20 changed files with 630 additions and 104 deletions.
diff --git a/benchmark/portblas/blas1/iamax.cpp b/benchmark/portblas/blas1/iamax.cpp
@@ -42,19 +42,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
   blas::SB_Handle& sb_handle = *sb_handle_ptr;
   auto q = sb_handle.get_queue();
 
-  using tuple_scalar_t = blas::IndexValueTuple<index_t, scalar_t>;
-
   // Create data
   std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
-  tuple_scalar_t out{-1, 0};
+  index_t out = 0;
 
   // This will clamp the values to what scalar_t can represent
   std::transform(std::begin(v1), std::end(v1), std::begin(v1), [](scalar_t v) {
     return utils::clamp_to_limits<scalar_t>(v);
   });
 
   auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
-  auto outI = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
+  auto outI = blas::helper::allocate<mem_alloc, index_t>(1, q);
 
   auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);
 
@@ -64,9 +62,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
   // Run a first time with a verification of the results
   index_t idx_ref =
       static_cast<index_t>(reference_blas::iamax(size, v1.data(), 1));
-  tuple_scalar_t idx_temp{-1, 0};
+  index_t idx_temp = -1;
   {
-    auto idx_temp_gpu = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
+    auto idx_temp_gpu = blas::helper::allocate<mem_alloc, index_t>(1, q);
     auto iamax_event =
         _iamax(sb_handle, size, inx, static_cast<index_t>(1), idx_temp_gpu);
     sb_handle.wait(iamax_event);
@@ -77,10 +75,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
     blas::helper::deallocate<mem_alloc>(idx_temp_gpu, q);
   }
 
-  if (idx_temp.ind != idx_ref) {
+  if (idx_temp != idx_ref) {
     std::ostringstream err_stream;
-    err_stream << "Index mismatch: " << idx_temp.ind << "; expected "
-               << idx_ref;
+    err_stream << "Index mismatch: " << idx_temp << "; expected " << idx_ref;
     const std::string& err_str = err_stream.str();
     state.SkipWithError(err_str.c_str());
     *success = false;

diff --git a/benchmark/portblas/blas1/iamin.cpp b/benchmark/portblas/blas1/iamin.cpp
@@ -42,18 +42,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
   blas::SB_Handle& sb_handle = *sb_handle_ptr;
   auto q = sb_handle.get_queue();
 
-  using tuple_scalar_t = blas::IndexValueTuple<index_t, scalar_t>;
-
   // Create data
   std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
-  tuple_scalar_t out{0, 0};
+  index_t out{0};
 
   std::transform(std::begin(v1), std::end(v1), std::begin(v1), [](scalar_t v) {
     return utils::clamp_to_limits<scalar_t>(v);
   });
 
   auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
-  auto outI = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
+  auto outI = blas::helper::allocate<mem_alloc, index_t>(1, q);
 
   auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);
 
@@ -63,9 +61,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
   // Run a first time with a verification of the results
   index_t idx_ref =
       static_cast<index_t>(reference_blas::iamin(size, v1.data(), 1));
-  tuple_scalar_t idx_temp{-1, -1};
+  index_t idx_temp = -1;
   {
-    auto idx_temp_gpu = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
+    auto idx_temp_gpu = blas::helper::allocate<mem_alloc, index_t>(1, q);
     auto iamin_event =
         _iamin(sb_handle, size, inx, static_cast<index_t>(1), idx_temp_gpu);
     sb_handle.wait(iamin_event);
@@ -76,10 +74,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
     blas::helper::deallocate<mem_alloc>(idx_temp_gpu, q);
   }
 
-  if (idx_temp.ind != idx_ref) {
+  if (idx_temp != idx_ref) {
     std::ostringstream err_stream;
-    err_stream << "Index mismatch: " << idx_temp.ind << "; expected "
-               << idx_ref;
+    err_stream << "Index mismatch: " << idx_temp << "; expected " << idx_ref;
     const std::string& err_str = err_stream.str();
     state.SkipWithError(err_str.c_str());
     *success = false;

diff --git a/include/blas_meta.h b/include/blas_meta.h
@@ -32,8 +32,12 @@
 #ifdef BLAS_ENABLE_COMPLEX
 #define SYCL_EXT_ONEAPI_COMPLEX
 #include <complex>
+#if __has_include(<ext/oneapi/experimental/complex/complex.hpp>)
+#include <ext/oneapi/experimental/complex/complex.hpp>
+#else
 #include <ext/oneapi/experimental/sycl_complex.hpp>
 #endif
+#endif
 
 namespace blas {
 
@@ -167,7 +171,7 @@ int append_vector(vector_t &lhs_vector, vector_t const &rhs_vector) {
 
 template <typename first_vector_t, typename... other_vector_t>
 first_vector_t concatenate_vectors(first_vector_t first_vector,
-                                   other_vector_t &&...other_vectors) {
+                                   other_vector_t &&... other_vectors) {
   int first_Vector_size = static_cast<int>(first_vector.size());
   int s[] = {vec_total_size(first_Vector_size, other_vectors)..., 0};
   first_vector.reserve(first_Vector_size);

diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h
@@ -144,6 +144,14 @@ typename sb_handle_t::event_t _asum_impl(
     container_1_t _rs, const index_t number_WG,
     const typename sb_handle_t::event_t &_dependencies);
 
+template <int localSize, int localMemSize, bool is_max, bool single,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamax_iamin_impl(
+    sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const index_t _nWG,
+    const typename sb_handle_t::event_t &_dependencies);
+
 /**
  * \brief IAMAX finds the index of the first element having maximum
  * @param _vx BufferIterator or USM pointer
@@ -156,6 +164,7 @@ template <typename sb_handle_t, typename container_t, typename ContainerI,
 typename sb_handle_t::event_t _iamax(
     sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx,
     ContainerI _rs, const typename sb_handle_t::event_t &_dependencies);
+
 /**
  * \brief IAMIN finds the index of the first element having minimum
  * @param _vx BufferIterator or USM pointer

diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
@@ -224,6 +224,36 @@ struct WGAtomicReduction {
   void adjust_access_displacement();
 };
 
+/**
+ * @brief Generic implementation for operators that require a
+ * reduction inside kernel code for computing index of max/min value within the
+ * input (i.e. iamax and iamin).
+ *
+ * The class is constructed using the make_index_max_min
+ * function below.
+ *
+ * @tparam is_max Whether the operator is iamax or iamin
+ * @tparam is_step0 Decides whether to write IndexValueTuple to output or final
+ * index output
+ * @tparam lhs_t Buffer or USM memory object type for output memory
+ * @tparam rhs_t Buffer or USM memory object type for input memory
+ */
+template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
+struct IndexMaxMin {
+  using value_t = typename rhs_t::value_t;
+  using index_t = typename rhs_t::index_t;
+  lhs_t lhs_;
+  rhs_t rhs_;
+  IndexMaxMin(lhs_t &_l, rhs_t &_r);
+  index_t get_size() const;
+  bool valid_thread(cl::sycl::nd_item<1> ndItem) const;
+  void eval(cl::sycl::nd_item<1> ndItem);
+  template <typename sharedT>
+  void eval(sharedT scratch, cl::sycl::nd_item<1> ndItem);
+  void bind(cl::sycl::handler &h);
+  void adjust_access_displacement();
+};
+
 /*! Rotg.
  * @brief Implements the rotg (blas level 1 api)
  */
@@ -280,6 +310,12 @@ inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
   return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
 }
 
+template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
+inline IndexMaxMin<is_max, is_step0, lhs_t, rhs_t> make_index_max_min(
+    lhs_t &lhs_, rhs_t &rhs_) {
+  return IndexMaxMin<is_max, is_step0, lhs_t, rhs_t>(lhs_, rhs_);
+}
+
 /*!
 @brief Template function for constructing operation nodes based on input
 template and function arguments. Non-specialized case for N reference operands.

diff --git a/include/operations/blas_constants.h b/include/operations/blas_constants.h
@@ -269,4 +269,12 @@ template <typename ind_t, typename val_t>
 struct sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
     : std::true_type {};
 
+template <typename ind_t, typename val_t>
+struct sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
+    : std::true_type {};
+
+template <typename ind_t, typename val_t>
+struct std::is_trivially_copyable<blas::IndexValueTuple<ind_t, val_t>>
+    : std::true_type {};
+
 #endif  // BLAS_CONSTANTS_H
diff --git a/include/portblas_helper.h b/include/portblas_helper.h
@@ -138,22 +138,24 @@ inline size_t get_num_compute_units(cl::sycl::queue &q) {
   @param size is the number of elements to be copied
 */
 template <typename element_t>
-inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src,
-                                      BufferIterator<element_t> dst,
-                                      size_t size) {
+inline cl::sycl::event copy_to_device(
+    cl::sycl::queue q, const element_t *src, BufferIterator<element_t> dst,
+    size_t size, const std::vector<cl::sycl::event> &_dependencies = {}) {
   auto event = q.submit([&](cl::sycl::handler &cgh) {
     auto acc = dst.template get_range_accessor<cl::sycl::access::mode::write>(
         cgh, size);
+    cgh.depends_on(_dependencies);
     cgh.copy(src, acc);
   });
   return event;
 }
 
 #ifdef SB_ENABLE_USM
 template <typename element_t>
-inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src,
-                                      element_t *dst, size_t size) {
-  auto event = q.memcpy(dst, src, size * sizeof(element_t));
+inline cl::sycl::event copy_to_device(
+    cl::sycl::queue q, const element_t *src, element_t *dst, size_t size,
+    const std::vector<cl::sycl::event> &_dependencies = {}) {
+  auto event = q.memcpy(dst, src, size * sizeof(element_t), _dependencies);
   return event;
 }
 #endif
@@ -195,8 +197,9 @@ inline cl::sycl::event copy_to_host(cl::sycl::queue q, const element_t *src,
 template <typename element_t>
 inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator<element_t> buff,
                             element_t value, size_t size,
-                            const std::vector<cl::sycl::event> &) {
+                            const std::vector<cl::sycl::event> &_dependencies) {
   auto event = q.submit([&](cl::sycl::handler &cgh) {
+    cgh.depends_on(_dependencies);
     auto acc = buff.template get_range_accessor<cl::sycl::access::mode::write>(
         cgh, size);
     cgh.fill(acc, value);

diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
@@ -49,6 +49,51 @@ typename sb_handle_t::event_t _asum(
 }  // namespace backend
 }  // namespace asum
 
+namespace iamax {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamax(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  if (_N < 65536) {
+    constexpr int localSize = 1024;
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, true, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    constexpr int localSize = 256;
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, true, false>(
+        sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamax
+
+namespace iamin {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamin(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  if (_N < 65536) {
+    constexpr int localSize = 1024;
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, false, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    constexpr int localSize = 256;
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, false,
+                                             false>(sb_handle, _N, _vx, _incx,
+                                                    _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamin
+
 namespace nrm2 {
 namespace backend {
 template <typename sb_handle_t, typename container_0_t, typename container_1_t,

diff --git a/src/interface/blas1/backend/default_cpu.hpp b/src/interface/blas1/backend/default_cpu.hpp
@@ -42,6 +42,48 @@ typename sb_handle_t::event_t _asum(
 }  // namespace backend
 }  // namespace asum
 
+namespace iamax {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamax(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  constexpr int localSize = 128;
+  if (_N < 8192) {
+    return blas::internal::_iamax_iamin_impl<localSize, 0, true, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, 0, true, false>(
+        sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamax
+
+namespace iamin {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamin(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  constexpr int localSize = 128;
+  if (_N < 8192) {
+    return blas::internal::_iamax_iamin_impl<localSize, 0, false, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, 0, false, false>(
+        sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamin
+
 namespace nrm2 {
 namespace backend {
 template <typename sb_handle_t, typename container_0_t, typename container_1_t,

diff --git a/src/interface/blas1/backend/intel_gpu.hpp b/src/interface/blas1/backend/intel_gpu.hpp
@@ -43,6 +43,48 @@ typename sb_handle_t::event_t _asum(
 }  // namespace backend
 }  // namespace asum
 
+namespace iamax {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamax(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  constexpr int localSize = 256;
+  if (_N < 8192) {
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, true, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, true, false>(
+        sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamax
+
+namespace iamin {
+namespace backend {
+template <typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
+typename sb_handle_t::event_t _iamin(
+    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
+    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
+  constexpr int localSize = 256;
+  if (_N < 8192) {
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, false, true>(
+        sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
+  } else {
+    const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
+                                 static_cast<index_t>(512));
+    return blas::internal::_iamax_iamin_impl<localSize, localSize, false,
+                                             false>(sb_handle, _N, _vx, _incx,
+                                                    _rs, nWG, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace iamin
 namespace nrm2 {
 namespace backend {
 template <typename sb_handle_t, typename container_0_t, typename container_1_t,