matter-labs · robik75 · Sep 8, 2024 · Sep 4, 2024
@@ -280,6 +280,11 @@ bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, c
                                                                static_cast<cudaStream_t>(stream.handle)));
 }
 
+bc_error pn_distribute_values(const void *src, void *dst, const unsigned count, const unsigned stride, bc_stream stream) {
+  return static_cast<bc_error>(pn::distribute_values(static_cast<const fd_q::storage *>(src), static_cast<fd_q::storage *>(dst), count, stride,
+                                                               static_cast<cudaStream_t>(stream.handle)));
+}
+
 bc_error pn_tear_down() { return static_cast<bc_error>(pn::tear_down()); };
 
 bc_error msm_set_up() { return static_cast<bc_error>(msm::set_up()); }

@@ -458,6 +458,14 @@ bc_error pn_generate_permutation_polynomials(generate_permutation_polynomials_co
 // stream - Stream on which this operation will be scheduled
 bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, unsigned count, bc_stream stream);
 
+// Distribute field element values with a stride
+// src - device pointer to the vector of field elements from where the values will be read
+// dst - device pointer to the vector of field elements to where the results will be written
+// count - number of values to distribute
+// stride - stride with which the values will be distributed
+// stream - Stream on which this operation will be scheduled
+bc_error pn_distribute_values(const void *src, void *dst, unsigned count, unsigned stride, bc_stream stream);
+
 // release all resources associated with the internal state for polynomial computations
 bc_error pn_tear_down();
 

@@ -141,4 +141,21 @@ cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *p
   return cudaGetLastError();
 }
 
+__global__ void distribute_values_kernel(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride) {
+  typedef fd_q::storage storage;
+  const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= count)
+    return;
+  const auto value = memory::load<storage, memory::ld_modifier::cs>(src + gid);
+  memory::store<storage, memory::st_modifier::cs>(dst + gid * stride, value);
+}
+
+cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride, cudaStream_t stream) {
+  const unsigned threads_per_block = 128;
+  const dim3 block_dim = count < threads_per_block ? count : threads_per_block;
+  const dim3 grid_dim = (count - 1) / block_dim.x + 1;
+  distribute_values_kernel<<<grid_dim, block_dim, 0, stream>>>(src, dst, count, stride);
+  return cudaGetLastError();
+}
+
 } // namespace pn
@@ -18,4 +18,6 @@ cudaError_t generate_permutation_matrix(fd_q::storage *values, const fd_q::stora
 
 cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *packet_bits, unsigned count, cudaStream_t stream);
 
+cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, unsigned count, unsigned stride, cudaStream_t stream);
+
 } // namespace pn
Original file line number	Diff line number	Diff line change
Expand Up		@@ -18,4 +18,6 @@ cudaError_t generate_permutation_matrix(fd_q::storage *values, const fd_q::stora

		cudaError_t set_values_from_packed_bits(fd_q::storage values, const unsigned packet_bits, unsigned count, cudaStream_t stream);

		cudaError_t distribute_values(const fd_q::storage src, fd_q::storage dst, unsigned count, unsigned stride, cudaStream_t stream);

		} // namespace pn