Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Refactor iamax and iamin api to match oneapi spec (#482)
Browse files Browse the repository at this point in the history
  • Loading branch information
muhammad-tanvir-1211 authored Dec 15, 2023
1 parent 4221640 commit 101c87a
Show file tree
Hide file tree
Showing 20 changed files with 630 additions and 104 deletions.
15 changes: 6 additions & 9 deletions benchmark/portblas/blas1/iamax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

using tuple_scalar_t = blas::IndexValueTuple<index_t, scalar_t>;

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
tuple_scalar_t out{-1, 0};
index_t out = 0;

// This will clamp the values to what scalar_t can represent
std::transform(std::begin(v1), std::end(v1), std::begin(v1), [](scalar_t v) {
return utils::clamp_to_limits<scalar_t>(v);
});

auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto outI = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
auto outI = blas::helper::allocate<mem_alloc, index_t>(1, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);

Expand All @@ -64,9 +62,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
// Run a first time with a verification of the results
index_t idx_ref =
static_cast<index_t>(reference_blas::iamax(size, v1.data(), 1));
tuple_scalar_t idx_temp{-1, 0};
index_t idx_temp = -1;
{
auto idx_temp_gpu = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
auto idx_temp_gpu = blas::helper::allocate<mem_alloc, index_t>(1, q);
auto iamax_event =
_iamax(sb_handle, size, inx, static_cast<index_t>(1), idx_temp_gpu);
sb_handle.wait(iamax_event);
Expand All @@ -77,10 +75,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas::helper::deallocate<mem_alloc>(idx_temp_gpu, q);
}

if (idx_temp.ind != idx_ref) {
if (idx_temp != idx_ref) {
std::ostringstream err_stream;
err_stream << "Index mismatch: " << idx_temp.ind << "; expected "
<< idx_ref;
err_stream << "Index mismatch: " << idx_temp << "; expected " << idx_ref;
const std::string& err_str = err_stream.str();
state.SkipWithError(err_str.c_str());
*success = false;
Expand Down
15 changes: 6 additions & 9 deletions benchmark/portblas/blas1/iamin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

using tuple_scalar_t = blas::IndexValueTuple<index_t, scalar_t>;

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
tuple_scalar_t out{0, 0};
index_t out{0};

std::transform(std::begin(v1), std::end(v1), std::begin(v1), [](scalar_t v) {
return utils::clamp_to_limits<scalar_t>(v);
});

auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto outI = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
auto outI = blas::helper::allocate<mem_alloc, index_t>(1, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);

Expand All @@ -63,9 +61,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
// Run a first time with a verification of the results
index_t idx_ref =
static_cast<index_t>(reference_blas::iamin(size, v1.data(), 1));
tuple_scalar_t idx_temp{-1, -1};
index_t idx_temp = -1;
{
auto idx_temp_gpu = blas::helper::allocate<mem_alloc, tuple_scalar_t>(1, q);
auto idx_temp_gpu = blas::helper::allocate<mem_alloc, index_t>(1, q);
auto iamin_event =
_iamin(sb_handle, size, inx, static_cast<index_t>(1), idx_temp_gpu);
sb_handle.wait(iamin_event);
Expand All @@ -76,10 +74,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas::helper::deallocate<mem_alloc>(idx_temp_gpu, q);
}

if (idx_temp.ind != idx_ref) {
if (idx_temp != idx_ref) {
std::ostringstream err_stream;
err_stream << "Index mismatch: " << idx_temp.ind << "; expected "
<< idx_ref;
err_stream << "Index mismatch: " << idx_temp << "; expected " << idx_ref;
const std::string& err_str = err_stream.str();
state.SkipWithError(err_str.c_str());
*success = false;
Expand Down
6 changes: 5 additions & 1 deletion include/blas_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,12 @@
#ifdef BLAS_ENABLE_COMPLEX
#define SYCL_EXT_ONEAPI_COMPLEX
#include <complex>
#if __has_include(<ext/oneapi/experimental/complex/complex.hpp>)
#include <ext/oneapi/experimental/complex/complex.hpp>
#else
#include <ext/oneapi/experimental/sycl_complex.hpp>
#endif
#endif

namespace blas {

Expand Down Expand Up @@ -167,7 +171,7 @@ int append_vector(vector_t &lhs_vector, vector_t const &rhs_vector) {

template <typename first_vector_t, typename... other_vector_t>
first_vector_t concatenate_vectors(first_vector_t first_vector,
other_vector_t &&...other_vectors) {
other_vector_t &&... other_vectors) {
int first_Vector_size = static_cast<int>(first_vector.size());
int s[] = {vec_total_size(first_Vector_size, other_vectors)..., 0};
first_vector.reserve(first_Vector_size);
Expand Down
9 changes: 9 additions & 0 deletions include/interface/blas1_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ typename sb_handle_t::event_t _asum_impl(
container_1_t _rs, const index_t number_WG,
const typename sb_handle_t::event_t &_dependencies);

template <int localSize, int localMemSize, bool is_max, bool single,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamax_iamin_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t _nWG,
const typename sb_handle_t::event_t &_dependencies);

/**
* \brief IAMAX finds the index of the first element having maximum
* @param _vx BufferIterator or USM pointer
Expand All @@ -156,6 +164,7 @@ template <typename sb_handle_t, typename container_t, typename ContainerI,
typename sb_handle_t::event_t _iamax(
sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx,
ContainerI _rs, const typename sb_handle_t::event_t &_dependencies);

/**
* \brief IAMIN finds the index of the first element having minimum
* @param _vx BufferIterator or USM pointer
Expand Down
36 changes: 36 additions & 0 deletions include/operations/blas1_trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,36 @@ struct WGAtomicReduction {
void adjust_access_displacement();
};

/**
* @brief Generic implementation for operators that require a
* reduction inside kernel code for computing index of max/min value within the
* input (i.e. iamax and iamin).
*
* The class is constructed using the make_index_max_min
* function below.
*
* @tparam is_max Whether the operator is iamax or iamin
* @tparam is_step0 Decides whether to write IndexValueTuple to output or final
* index output
* @tparam lhs_t Buffer or USM memory object type for output memory
* @tparam rhs_t Buffer or USM memory object type for input memory
*/
template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
struct IndexMaxMin {
using value_t = typename rhs_t::value_t;
using index_t = typename rhs_t::index_t;
lhs_t lhs_;
rhs_t rhs_;
IndexMaxMin(lhs_t &_l, rhs_t &_r);
index_t get_size() const;
bool valid_thread(cl::sycl::nd_item<1> ndItem) const;
void eval(cl::sycl::nd_item<1> ndItem);
template <typename sharedT>
void eval(sharedT scratch, cl::sycl::nd_item<1> ndItem);
void bind(cl::sycl::handler &h);
void adjust_access_displacement();
};

/*! Rotg.
* @brief Implements the rotg (blas level 1 api)
*/
Expand Down Expand Up @@ -280,6 +310,12 @@ inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
}

template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
inline IndexMaxMin<is_max, is_step0, lhs_t, rhs_t> make_index_max_min(
lhs_t &lhs_, rhs_t &rhs_) {
return IndexMaxMin<is_max, is_step0, lhs_t, rhs_t>(lhs_, rhs_);
}

/*!
@brief Template function for constructing operation nodes based on input
template and function arguments. Non-specialized case for N reference operands.
Expand Down
8 changes: 8 additions & 0 deletions include/operations/blas_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,4 +269,12 @@ template <typename ind_t, typename val_t>
struct sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};

template <typename ind_t, typename val_t>
struct sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};

template <typename ind_t, typename val_t>
struct std::is_trivially_copyable<blas::IndexValueTuple<ind_t, val_t>>
: std::true_type {};

#endif // BLAS_CONSTANTS_H
17 changes: 10 additions & 7 deletions include/portblas_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,22 +138,24 @@ inline size_t get_num_compute_units(cl::sycl::queue &q) {
@param size is the number of elements to be copied
*/
template <typename element_t>
inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src,
BufferIterator<element_t> dst,
size_t size) {
inline cl::sycl::event copy_to_device(
cl::sycl::queue q, const element_t *src, BufferIterator<element_t> dst,
size_t size, const std::vector<cl::sycl::event> &_dependencies = {}) {
auto event = q.submit([&](cl::sycl::handler &cgh) {
auto acc = dst.template get_range_accessor<cl::sycl::access::mode::write>(
cgh, size);
cgh.depends_on(_dependencies);
cgh.copy(src, acc);
});
return event;
}

#ifdef SB_ENABLE_USM
template <typename element_t>
inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src,
element_t *dst, size_t size) {
auto event = q.memcpy(dst, src, size * sizeof(element_t));
inline cl::sycl::event copy_to_device(
cl::sycl::queue q, const element_t *src, element_t *dst, size_t size,
const std::vector<cl::sycl::event> &_dependencies = {}) {
auto event = q.memcpy(dst, src, size * sizeof(element_t), _dependencies);
return event;
}
#endif
Expand Down Expand Up @@ -195,8 +197,9 @@ inline cl::sycl::event copy_to_host(cl::sycl::queue q, const element_t *src,
template <typename element_t>
inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator<element_t> buff,
element_t value, size_t size,
const std::vector<cl::sycl::event> &) {
const std::vector<cl::sycl::event> &_dependencies) {
auto event = q.submit([&](cl::sycl::handler &cgh) {
cgh.depends_on(_dependencies);
auto acc = buff.template get_range_accessor<cl::sycl::access::mode::write>(
cgh, size);
cgh.fill(acc, value);
Expand Down
45 changes: 45 additions & 0 deletions src/interface/blas1/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,51 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace iamax {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamax(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < 65536) {
constexpr int localSize = 1024;
return blas::internal::_iamax_iamin_impl<localSize, localSize, true, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
constexpr int localSize = 256;
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, localSize, true, false>(
sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamax

namespace iamin {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamin(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < 65536) {
constexpr int localSize = 1024;
return blas::internal::_iamax_iamin_impl<localSize, localSize, false, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
constexpr int localSize = 256;
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, localSize, false,
false>(sb_handle, _N, _vx, _incx,
_rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamin

namespace nrm2 {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down
42 changes: 42 additions & 0 deletions src/interface/blas1/backend/default_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,48 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace iamax {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamax(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
constexpr int localSize = 128;
if (_N < 8192) {
return blas::internal::_iamax_iamin_impl<localSize, 0, true, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, 0, true, false>(
sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamax

namespace iamin {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamin(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
constexpr int localSize = 128;
if (_N < 8192) {
return blas::internal::_iamax_iamin_impl<localSize, 0, false, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, 0, false, false>(
sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamin

namespace nrm2 {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down
42 changes: 42 additions & 0 deletions src/interface/blas1/backend/intel_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,48 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace iamax {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamax(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
constexpr int localSize = 256;
if (_N < 8192) {
return blas::internal::_iamax_iamin_impl<localSize, localSize, true, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, localSize, true, false>(
sb_handle, _N, _vx, _incx, _rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamax

namespace iamin {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _iamin(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
constexpr int localSize = 256;
if (_N < 8192) {
return blas::internal::_iamax_iamin_impl<localSize, localSize, false, true>(
sb_handle, _N, _vx, _incx, _rs, static_cast<index_t>(1), _dependencies);
} else {
const index_t nWG = std::min((_N + localSize - 1) / (localSize * 4),
static_cast<index_t>(512));
return blas::internal::_iamax_iamin_impl<localSize, localSize, false,
false>(sb_handle, _N, _vx, _incx,
_rs, nWG, _dependencies);
}
}
} // namespace backend
} // namespace iamin
namespace nrm2 {
namespace backend {
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down
Loading

0 comments on commit 101c87a

Please sign in to comment.