Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Added the relevant copy-to-device step and initialization of the input/output result
Browse files (browse the repository at this point in the history)
  • Loading branch information
OuadiElfarouki committed Oct 16, 2023
1 parent ca6342c commit 67bae5e
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 30 deletions.
11 changes: 7 additions & 4 deletions benchmark/portblas/blas1/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
scalar_t vr_temp = 0;
{
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto copyToD =
blas::helper::copy_to_device<scalar_t>(q, &vr_temp, vr_temp_gpu, 1);
auto dot_event = _dot(sb_handle, size, inx, static_cast<index_t>(1), iny,
static_cast<index_t>(1), vr_temp_gpu);
static_cast<index_t>(1), vr_temp_gpu, {copyToD});
sb_handle.wait(dot_event);
auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(copy_output);
Expand Down Expand Up @@ -128,8 +130,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
};

benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, mem_type).c_str(),
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(size, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
Expand All @@ -141,7 +143,8 @@ void register_benchmark(blas_benchmark::Args& args,
auto dot_params = blas_benchmark::utils::get_blas1_params(args);

register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, dot_params);
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
dot_params);
#ifdef SB_ENABLE_USM
register_benchmark<scalar_t, blas::helper::AllocType::usm>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, dot_params);
Expand Down
14 changes: 9 additions & 5 deletions benchmark/portblas/blas1/sdsdot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
scalar_t vr_temp = 0;
{
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto copyToD =
blas::helper::copy_to_device<scalar_t>(q, &vr_temp, vr_temp_gpu, 1);
auto sdsdot_event =
_sdsdot(sb_handle, size, sb, inx, static_cast<index_t>(1), iny,
static_cast<index_t>(1), vr_temp_gpu);
static_cast<index_t>(1), vr_temp_gpu, {copyToD});
sb_handle.wait(sdsdot_event);
auto event = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(event);
Expand Down Expand Up @@ -126,8 +128,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, mem_type).c_str(),
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(size, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
Expand All @@ -139,10 +141,12 @@ void register_benchmark(blas_benchmark::Args& args,
auto sdsdot_params = blas_benchmark::utils::get_blas1_params(args);

register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, sdsdot_params);
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
sdsdot_params);
#ifdef SB_ENABLE_USM
register_benchmark<scalar_t, blas::helper::AllocType::usm>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, sdsdot_params);
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
sdsdot_params);
#endif
}

Expand Down
52 changes: 31 additions & 21 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,22 @@ typename sb_handle_t::event_t _sdsdot(
sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx,
increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t &_dependencies) {
typename sb_handle_t::event_t dot_event{};
auto rs = make_vector_view(_rs, static_cast<increment_t>(1),
static_cast<index_t>(1));

dot_event = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy,
_rs, _dependencies);
auto addOp = make_op<ScalarOp, AddOperator>(sb, rs);
auto assignOp2 = make_op<Assign>(rs, addOp);
auto ret2 = sb_handle.execute(assignOp2, dot_event);
return blas::concatenate_vectors(dot_event, ret2);
if (!_N) {
auto addOp = make_op<ScalarOp, AddOperator>(sb, rs);
auto assignOp = make_op<Assign>(rs, addOp);
auto ret = sb_handle.execute(assignOp, _dependencies);
return ret;
} else {
typename sb_handle_t::event_t dotOp{};
dotOp = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs,
_dependencies);
auto addOp = make_op<ScalarOp, AddOperator>(sb, rs);
auto assignOp2 = make_op<Assign>(rs, addOp);
auto ret = sb_handle.execute(assignOp2, dotOp);
return blas::concatenate_vectors(dotOp, ret);
}
}

/**
Expand Down Expand Up @@ -761,18 +767,17 @@ typename ValueType<container_0_t>::type _dot(
auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
element_t > (static_cast<index_t>(1), sb_handle.get_queue());
auto copy_to_d =
auto copyTodD =
blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1);
typename sb_handle_t::event_t all_deps = concatenate_vectors(
_dependencies, typename sb_handle_t::event_t{copy_to_d});
_dependencies, typename sb_handle_t::event_t{copyTodD});

auto dot_event =
auto dotOp =
internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, gpu_res, all_deps);

sb_handle.wait(dot_event);
auto copy_to_h =
helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1);
sb_handle.wait(copy_to_h);
sb_handle.wait(dotOp);
auto copyToH = helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1);
sb_handle.wait(copyToH);

helper::deallocate<is_usm ? helper::AllocType::usm
: helper::AllocType::buffer>(gpu_res,
Expand Down Expand Up @@ -808,16 +813,21 @@ typename ValueType<container_0_t>::type _sdsdot(
const typename sb_handle_t::event_t &_dependencies) {
constexpr bool is_usm = std::is_pointer<container_0_t>::value;
using element_t = typename ValueType<container_0_t>::type;
element_t res{};
element_t res = element_t(0);
auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
element_t > (static_cast<index_t>(1), sb_handle.get_queue());
auto event1 = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy,
_incy, gpu_res, _dependencies);
sb_handle.wait(event1);
auto event2 =
auto copyTodD =
blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1);
typename sb_handle_t::event_t all_deps = concatenate_vectors(
_dependencies, typename sb_handle_t::event_t{copyTodD});

auto sdsdot_event = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx,
_vy, _incy, gpu_res, all_deps);
sb_handle.wait(sdsdot_event);
auto copyToH =
blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1);
sb_handle.wait(event2);
sb_handle.wait(copyToH);

blas::helper::deallocate<is_usm ? helper::AllocType::usm
: helper::AllocType::buffer>(
Expand Down

0 comments on commit 67bae5e

Please sign in to comment.