diff --git a/src/cl/prefix_sum.cl b/src/cl/prefix_sum.cl
index 0ffdd02f..0de3650d 100644
--- a/src/cl/prefix_sum.cl
+++ b/src/cl/prefix_sum.cl
@@ -1 +1,34 @@
-// TODO
\ No newline at end of file
+#ifdef __CLION_IDE__
+    #include <libgpu/opencl/cl/clion_defines.cl>
+#endif
+
+#line 6
+
+// First phase of the in-place prefix sum, one launch per stride.
+// Work item g owns index i = (g + 1) * rate - 1 and, when i < n, adds to sum[i]
+// the element rate / 2 positions before it. The host calls this with
+// rate = 2, 4, 8, ... while rate <= n.
+__kernel void prefix_sum_binary(__global unsigned int* sum, unsigned int n, unsigned int rate)
+{
+    // Keep the index 64-bit and unsigned: narrowing it to a 32-bit int before the
+    // bounds check could wrap a large product back into [0, n), and would compare
+    // a signed value against the unsigned n.
+    const ulong i = ((ulong)get_global_id(0) + 1) * rate - 1;
+    if (i < n) {
+        sum[i] += sum[i - (rate >> 1)];
+    }
+}
+
+// Second phase of the in-place prefix sum, one launch per stride.
+// Work item g owns index i = (g + 1) * rate - 1 + rate / 2 and, when i < n, adds
+// to sum[i] the element rate / 2 positions before it, i.e. sum[(g + 1) * rate - 1].
+// The host calls this with rate = n / 2, n / 4, ... down to 2.
+__kernel void prefix_sum_second_part(__global unsigned int* sum, unsigned int n, unsigned int rate)
+{
+    // Same 64-bit unsigned index as in prefix_sum_binary, so the bounds check sees
+    // the untruncated value.
+    const ulong i = ((ulong)get_global_id(0) + 1) * rate - 1 + (rate >> 1);
+    if (i < n) {
+        sum[i] += sum[i - (rate >> 1)];
+    }
+}
diff --git a/src/main_prefix_sum.cpp b/src/main_prefix_sum.cpp
index 782a88bd..187e832c 100644
--- a/src/main_prefix_sum.cpp
+++ b/src/main_prefix_sum.cpp
@@ -83,18 +83,41 @@ int main(int argc, char **argv)
 #endif
 
     // work-efficient prefix sum
-#if 0
+#if 1
     {
+        gpu::Device device = gpu::chooseGPUDevice(argc, argv);
+        gpu::Context context;
+        context.init(device.device_id_opencl);
+        context.activate();
+
         std::vector<unsigned int> res(n);
 
+        ocl::Kernel prefix_sum_binary(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_binary");
+        ocl::Kernel prefix_sum_second_part(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_second_part");
+        prefix_sum_binary.compile();
+        prefix_sum_second_part.compile();
+
+        gpu::gpu_mem_32u gpu;
+        gpu.resizeN(n);
+
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
-            // TODO
+            gpu.writeN(as.data(), as.size());
             t.restart();
-            // TODO
+
+            for (unsigned int rate = 2; rate <= n; rate *= 2)
+                prefix_sum_binary.exec(
+                    gpu::WorkSize(64, n / rate), gpu, n, rate);
+
+            for (unsigned int rate = n / 2; rate >= 2; rate /= 2)
+                prefix_sum_second_part.exec(gpu::WorkSize(64, (n + rate - 1) / rate),
+                                            gpu, n, rate);
+
             t.nextLap();
         }
 
+        gpu.readN(res.data(), as.size());
+
         std::cout << "GPU [work-efficient]: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU [work-efficient]: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;