GPGPUCourse · ninachekalina · Jan 7, 2025
diff --git a/src/cl/bitonic.cl b/src/cl/bitonic.cl
@@ -1,4 +1,16 @@
-__kernel void bitonic()
+__kernel void bitonic(__global int *data, unsigned int step_size, unsigned int sub_step_size) // One compare-exchange pass: each work-item orders a single pair of data[] elements that lie sub_step_size apart.
 {
+    unsigned int thread_id = get_global_id(0); // One work-item per compared pair.
+    unsigned int segment_index = thread_id / step_size; // Index of the run of step_size consecutive work-items this one falls into.
+    bool sort_direction = segment_index % 2 == 0; // true (even run) => pair is put in ascending order; false (odd run) => descending.
+    unsigned int local_idx = thread_id / sub_step_size * (sub_step_size * 2) + (thread_id % sub_step_size); // Lower element of the pair: offset (thread_id % sub_step_size) inside chunk (thread_id / sub_step_size), chunks being 2 * sub_step_size elements long.
 
+    unsigned int pair_idx = local_idx + sub_step_size; // Upper element of the pair. NOTE(review): no length argument or bounds check in this kernel - presumably the caller only launches work-items whose pair lies inside data; confirm.
+    if (sort_direction && data[local_idx] > data[pair_idx] || // Ascending run, but the pair is currently descending ... (&& binds tighter than ||, so the grouping is as laid out)
+        !sort_direction && data[local_idx] < data[pair_idx] // ... or descending run, but the pair is currently ascending.
+    ) {
+        int temp = data[local_idx]; // Exchange the two elements through a temporary.
+        data[local_idx] = data[pair_idx];
+        data[pair_idx] = temp;
+    }
 }
diff --git a/src/main_bitonic.cpp b/src/main_bitonic.cpp
@@ -68,16 +68,22 @@ int main(int argc, char **argv) {
         ocl::Kernel bitonic(bitonic_kernel, bitonic_kernel_length, "bitonic");
         bitonic.compile();
 
+        unsigned int workGroupSize = 64;
+        unsigned int globalWorkSize = (n / 2 + workGroupSize - 1) / workGroupSize * workGroupSize;
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
             as_gpu.writeN(as.data(), n);
             t.restart();// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
-
-            /*TODO*/
-
+            for (unsigned int blockHalfSize = 1; blockHalfSize <= n / 2; blockHalfSize *= 2) {
+                for (unsigned int subBlockHalfSize = blockHalfSize; subBlockHalfSize >= 1; subBlockHalfSize /= 2) {
+                    bitonic.exec(
+                        gpu::WorkSize(workGroupSize, globalWorkSize),
+                        as_gpu, blockHalfSize, subBlockHalfSize
+                    );
+                }
+            }
             t.nextLap();
         }
-
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
     }