task07 solution

GPGPUCourse · Jan 13, 2025 · aaa577e · aaa577e
1 parent 286de39
commit aaa577e
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 5 deletions.
diff --git a/src/cl/prefix_sum.cl b/src/cl/prefix_sum.cl
@@ -1 +1,19 @@
-// TODO
+#ifdef __CLION_IDE__
+    #include <libgpu/opencl/cl/clion_defines.cl>
+#endif
+
+#line 6
+
+__kernel void prefix_sum_binary(__global unsigned int* sum, unsigned int n, unsigned int rate) // First-phase pass, in place on sum[0..n): one work-item per block of `rate` elements; the host launches it with rate = 2, 4, ..., n.
+{
+       const int i = (get_global_id(0) + 1) * rate - 1; // index of the last element of this work-item's block [gid*rate, (gid+1)*rate)
+       if (i < n) // surplus work-items past the end of the array do nothing; NOTE(review): i is a signed int compared against unsigned n
+            sum[i] += sum[i - rate / 2]; // add the element rate/2 positions earlier (for even rate: the last element of the block's first half)
+}
+
+__kernel void prefix_sum_second_part(__global unsigned int* sum, unsigned int n, unsigned int rate) // Second-phase pass, in place on sum[0..n); the host launches it with rate = n/2, n/4, ..., 2 after the first phase.
+{
+    const int i = (get_global_id(0) + 1) * rate - 1 + rate / 2; // rate/2 positions past the last element of block gid, i.e. inside the following block
+    if (i < n) // surplus work-items past the end of the array do nothing; NOTE(review): i is a signed int compared against unsigned n
+        sum[i] += sum[i - rate / 2]; // add the value stored at the last element of block gid (index (gid+1)*rate - 1)
+}
diff --git a/src/main_prefix_sum.cpp b/src/main_prefix_sum.cpp
@@ -47,6 +47,12 @@ std::vector<unsigned int> computeCPU(const std::vector<unsigned int> &as)
 
 int main(int argc, char **argv)
 {
+    gpu::Device device = gpu::chooseGPUDevice(argc, argv);
+
+    gpu::Context context;
+    context.init(device.device_id_opencl);
+    context.activate();
+
 	for (unsigned int n = 4096; n <= max_n; n *= 4) {
 		std::cout << "______________________________________________" << std::endl;
 		unsigned int values_range = std::min<unsigned int>(1023, std::numeric_limits<int>::max() / n);
@@ -83,25 +89,41 @@ int main(int argc, char **argv)
 #endif
 
 // work-efficient prefix sum
-#if 0
         {
             std::vector<unsigned int> res(n);
+            ocl::Kernel prefix_sum_binary(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_binary"); // kernel object for the first-phase entry point in prefix_sum_kernel
+            ocl::Kernel prefix_sum_second_part(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_second_part"); // kernel object for the second-phase entry point
+            prefix_sum_binary.compile(); // compiled once here, outside the timed loop
+            prefix_sum_second_part.compile();
+
+            gpu::gpu_mem_32u gpu;
+            gpu.resizeN(n); // size the device buffer for n elements
 
             timer t;
             for (int iter = 0; iter < benchmarkingIters; ++iter) {
-                // TODO
+                gpu.writeN(as.data(), as.size()); // re-upload the input every iteration because the kernels update the buffer in place; done before t.restart(), so the upload is not timed
                 t.restart();
-                // TODO
+
+                for (unsigned int rate = 2; rate <= n; rate *= 2) { // first phase: rate = 2, 4, ..., n
+                    prefix_sum_binary.exec(gpu::WorkSize(64, n / rate), gpu, n, rate); // n / rate blocks to process; NOTE(review): presumably WorkSize(64, N) means group size 64 with the global size rounded up - confirm
+
+                }
+
+                for (unsigned int rate = n / 2; rate >= 2; rate /= 2) { // second phase: rate = n/2, n/4, ..., 2
+                    prefix_sum_second_part.exec(gpu::WorkSize(64, (n + rate - 1) / rate), gpu, n, rate); // ceil(n / rate) work-items requested; the kernel's i < n check discards the ones that fall past the end
+                } // NOTE(review): the lap recorded right after this loop assumes exec() has completed when it returns - confirm
+
                 t.nextLap();
             }
 
+            gpu.readN(res.data(), as.size()); // read the result back once, after the benchmark loop
+
             std::cout << "GPU [work-efficient]: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
             std::cout << "GPU [work-efficient]: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
 
             for (int i = 0; i < n; ++i) {
                 EXPECT_THE_SAME(cpu_reference[i], res[i], "GPU result should be consistent!");
             }
         }
-#endif
 	}
 }