From c858d6cb53cbe33e562568e449b4ba753b5fae6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9D=D0=B8=D0=BD=D0=B0=20=D0=A7=D0=B5=D0=BA=D0=B0=D0=BB?=
 =?UTF-8?q?=D0=B8=D0=BD=D0=B0?= <na.chekalina@interfax.ru>
Date: Tue, 7 Jan 2025 10:37:37 +0300
Subject: [PATCH] Add files: bitonic.cl, main_bitonic.cpp

---
 src/cl/bitonic.cl    | 14 +++++++++++++-
 src/main_bitonic.cpp | 14 ++++++++++----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/cl/bitonic.cl b/src/cl/bitonic.cl
index ecf336b9..8cefbd59 100644
--- a/src/cl/bitonic.cl
+++ b/src/cl/bitonic.cl
@@ -1,4 +1,16 @@
-__kernel void bitonic()
+__kernel void bitonic(__global int *data, unsigned int step_size, unsigned int sub_step_size) // one compare-exchange pass over data: each work-item handles exactly one pair of elements
 {
+    unsigned int thread_id = get_global_id(0); // global index of this work-item in dimension 0
+    unsigned int segment_index = thread_id / step_size; // index of the run of step_size consecutive work-items this one belongs to
+    bool sort_direction = segment_index % 2 == 0; // even segment: smaller value goes to the lower index; odd segment: larger value goes there
+    unsigned int local_idx = thread_id / sub_step_size * (sub_step_size * 2) + (thread_id % sub_step_size); // lower element of the pair: each group of sub_step_size work-items covers 2 * sub_step_size elements
 
+    unsigned int pair_idx = local_idx + sub_step_size; // partner element; NOTE(review): no bound check against the array length - confirm the host never launches work-items whose pair lies outside data
+    if (sort_direction && data[local_idx] > data[pair_idx] || // even segment and the pair is not in ascending order
+        !sort_direction && data[local_idx] < data[pair_idx] // odd segment and the pair is not in descending order
+    ) {
+        int temp = data[local_idx]; // swap the two elements in place
+        data[local_idx] = data[pair_idx];
+        data[pair_idx] = temp;
+    }
 }
diff --git a/src/main_bitonic.cpp b/src/main_bitonic.cpp
index 9b508a6c..0513ee7e 100644
--- a/src/main_bitonic.cpp
+++ b/src/main_bitonic.cpp
@@ -68,16 +68,22 @@ int main(int argc, char **argv) {
         ocl::Kernel bitonic(bitonic_kernel, bitonic_kernel_length, "bitonic");
         bitonic.compile();
 
+        unsigned int workGroupSize = 64;
+        unsigned int globalWorkSize = (n / 2 + workGroupSize - 1) / workGroupSize * workGroupSize;
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
             as_gpu.writeN(as.data(), n);
             t.restart();// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
-
-            /*TODO*/
-
+            for (unsigned int blockHalfSize = 1; blockHalfSize <= n / 2; blockHalfSize *= 2) {
+                for (unsigned int subBlockHalfSize = blockHalfSize; subBlockHalfSize >= 1; subBlockHalfSize /= 2) {
+                    bitonic.exec(
+                        gpu::WorkSize(workGroupSize, globalWorkSize),
+                        as_gpu, blockHalfSize, subBlockHalfSize
+                    );
+                }
+            }
             t.nextLap();
         }
-
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
     }