From aaa577e9a60e62433f5d3e2046a5b18cf94d7fdb Mon Sep 17 00:00:00 2001
From: Fedor Kudriavtsev
Date: Mon, 13 Jan 2025 23:10:31 +0300
Subject: [PATCH] task07 solution

---
 src/cl/prefix_sum.cl    | 30 +++++++++++++++++++++++++++++-
 src/main_prefix_sum.cpp | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/src/cl/prefix_sum.cl b/src/cl/prefix_sum.cl
index 0ffdd02f..264eb5d1 100644
--- a/src/cl/prefix_sum.cl
+++ b/src/cl/prefix_sum.cl
@@ -1 +1,29 @@
-// TODO
\ No newline at end of file
+#ifdef __CLION_IDE__
+    #include <libgpu/opencl/cl/clion_defines.cl>
+#endif
+
+#line 6
+
+// Up-sweep step of the in-place work-efficient prefix sum.
+// Work item g owns index i = (g + 1) * rate - 1 (the last element of the g-th
+// block of `rate` elements) and adds to it the value stored rate / 2 positions
+// to the left, i.e. at the end of that block's left half.
+__kernel void prefix_sum_binary(__global unsigned int* sum, unsigned int n, unsigned int rate)
+{
+    // Keep the index in size_t, the type get_global_id() returns, instead of
+    // narrowing it to a signed int before the unsigned bounds check.
+    const size_t i = (get_global_id(0) + 1) * rate - 1;
+    if (i < n)
+        sum[i] += sum[i - rate / 2];
+}
+
+// Down-sweep step of the in-place work-efficient prefix sum.
+// Work item g owns index i = (g + 1) * rate - 1 + rate / 2 (the middle of the
+// gap between two block ends of size `rate`) and adds to it the value stored
+// rate / 2 positions to the left, i.e. at the preceding block end.
+__kernel void prefix_sum_second_part(__global unsigned int* sum, unsigned int n, unsigned int rate)
+{
+    const size_t i = (get_global_id(0) + 1) * rate - 1 + rate / 2;
+    if (i < n)
+        sum[i] += sum[i - rate / 2];
+}
diff --git a/src/main_prefix_sum.cpp b/src/main_prefix_sum.cpp
index 782a88bd..7555c0df 100644
--- a/src/main_prefix_sum.cpp
+++ b/src/main_prefix_sum.cpp
@@ -47,6 +47,12 @@ std::vector computeCPU(const std::vector &as)
 
 int main(int argc, char **argv)
 {
+    gpu::Device device = gpu::chooseGPUDevice(argc, argv);
+
+    gpu::Context context;
+    context.init(device.device_id_opencl);
+    context.activate();
+
     for (unsigned int n = 4096; n <= max_n; n *= 4) {
         std::cout << "______________________________________________" << std::endl;
         unsigned int values_range = std::min(1023, std::numeric_limits::max() / n);
@@ -83,18 +89,40 @@ int main(int argc, char **argv)
 #endif
 
         // work-efficient prefix sum
-#if 0
         {
             std::vector res(n);
+            ocl::Kernel prefix_sum_binary(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_binary");
+            ocl::Kernel prefix_sum_second_part(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_second_part");
+            prefix_sum_binary.compile();
+            prefix_sum_second_part.compile();
+
+            // Device buffer scanned in place; not called `gpu`, so it does not hide the gpu namespace.
+            gpu::gpu_mem_32u sums_gpu;
+            sums_gpu.resizeN(n);
 
             timer t;
             for (int iter = 0; iter < benchmarkingIters; ++iter) {
-                // TODO
+                // The scan is in place, so the input is re-uploaded before every timed lap.
+                sums_gpu.writeN(as.data(), as.size());
                 t.restart();
-                // TODO
+
+                // Up-sweep: after the pass for `rate`, the last element of every
+                // rate-sized block holds the sum of that block.
+                for (unsigned int rate = 2; rate <= n; rate *= 2) {
+                    prefix_sum_binary.exec(gpu::WorkSize(64, n / rate), sums_gpu, n, rate);
+                }
+
+                // Down-sweep: starting from rate = n / 2 assumes n is a power of two,
+                // which holds for every n this benchmark loop produces (4096 * 4^k).
+                for (unsigned int rate = n / 2; rate >= 2; rate /= 2) {
+                    prefix_sum_second_part.exec(gpu::WorkSize(64, (n + rate - 1) / rate), sums_gpu, n, rate);
+                }
+
                 t.nextLap();
             }
 
+            sums_gpu.readN(res.data(), res.size());
+
             std::cout << "GPU [work-efficient]: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
             std::cout << "GPU [work-efficient]: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
 
@@ -102,6 +130,5 @@ int main(int argc, char **argv)
                 EXPECT_THE_SAME(cpu_reference[i], res[i], "GPU result should be consistent!");
             }
         }
-#endif
     }
 }