Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add distribute function #14

Merged
merged 1 commit into from
Sep 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/bellman-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,11 @@ bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, c
static_cast<cudaStream_t>(stream.handle)));
}

bc_error pn_distribute_values(const void *src, void *dst, const unsigned count, const unsigned stride, bc_stream stream) {
return static_cast<bc_error>(pn::distribute_values(static_cast<const fd_q::storage *>(src), static_cast<fd_q::storage *>(dst), count, stride,
static_cast<cudaStream_t>(stream.handle)));
}

bc_error pn_tear_down() { return static_cast<bc_error>(pn::tear_down()); };

bc_error msm_set_up() { return static_cast<bc_error>(msm::set_up()); }
Expand Down
8 changes: 8 additions & 0 deletions src/bellman-cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,14 @@ bc_error pn_generate_permutation_polynomials(generate_permutation_polynomials_co
// stream - Stream on which this operation will be scheduled
bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, unsigned count, bc_stream stream);

// Distribute field element values with a stride
// src - device pointer to the vector of field elements from where the values will be read
// dst - device pointer to the vector of field elements to where the results will be written
// count - number of values to distribute
// stride - stride with which the values will be distributed
// stream - Stream on which this operation will be scheduled
bc_error pn_distribute_values(const void *src, void *dst, unsigned count, unsigned stride, bc_stream stream);

// release all resources associated with the internal state for polynomial computations
bc_error pn_tear_down();

Expand Down
17 changes: 17 additions & 0 deletions src/pn_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,21 @@ cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *p
return cudaGetLastError();
}

__global__ void distribute_values_kernel(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride) {
typedef fd_q::storage storage;
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
if (gid >= count)
return;
const auto value = memory::load<storage, memory::ld_modifier::cs>(src + gid);
memory::store<storage, memory::st_modifier::cs>(dst + gid * stride, value);
}

cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride, cudaStream_t stream) {
const unsigned threads_per_block = 128;
const dim3 block_dim = count < threads_per_block ? count : threads_per_block;
const dim3 grid_dim = (count - 1) / block_dim.x + 1;
distribute_values_kernel<<<grid_dim, block_dim, 0, stream>>>(src, dst, count, stride);
return cudaGetLastError();
}

} // namespace pn
2 changes: 2 additions & 0 deletions src/pn_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ cudaError_t generate_permutation_matrix(fd_q::storage *values, const fd_q::stora

cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *packet_bits, unsigned count, cudaStream_t stream);

cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, unsigned count, unsigned stride, cudaStream_t stream);

} // namespace pn
Loading