66// ----------------------------------------------------------------------------
77
88#include " open3d/core/Dispatch.h"
9- #include " open3d/core/Indexer.h"
109#include " open3d/core/SYCLContext.h"
1110#include " open3d/core/Tensor.h"
1211#include " open3d/utility/Logging.h"
@@ -15,45 +14,147 @@ namespace open3d {
1514namespace core {
1615namespace kernel {
1716
17+ namespace {
18+
19+ template <typename scalar_t >
20+ // Launches contiguous index_add over dim0:
21+ // dst[index[i], ...] += src[i, ...]
22+ // Contract:
23+ // - `index_ptr`, `src_ptr`, and `dst_ptr` point to contiguous buffers.
24+ // - `index_length` is the length of `index_ptr` and the leading dimension of
25+ // `src_ptr`.
26+ // - `broadcasting_elems` is the flattened product of non-reduction dimensions.
27+ // - `dst_ptr` has enough rows to address all index values.
28+ void LaunchIndexAddContiguousSYCLKernel (sycl::queue& queue,
29+ const int64_t * index_ptr,
30+ const scalar_t * src_ptr,
31+ scalar_t * dst_ptr,
32+ int64_t index_length,
33+ int64_t broadcasting_elems) {
34+ if (index_length <= 0 || broadcasting_elems <= 0 ) {
35+ return ;
36+ }
37+
38+ auto ceil_div = [](int64_t a, int64_t b) -> int64_t {
39+ return (a + b - 1 ) / b;
40+ };
41+ auto round_up = [](int64_t x, int64_t m) -> int64_t {
42+ return ((x + m - 1 ) / m) * m;
43+ };
44+
45+ // 2D launch configuration:
46+ // - X dimension tiles columns (broadcasting_elems).
47+ // - Y dimension tiles reduction rows (index_length).
48+ //
49+ // Each work-group processes TILE_ROWS rows and WG_X columns. Within a row
50+ // tile, consecutive runs of identical destination indices are reduced into
51+ // one atomic add per (column, run), reducing atomic pressure while
52+ // preserving index_add semantics.
53+ constexpr int WG_X = 256 ;
54+ constexpr int TILE_ROWS = 8 ;
55+ const int64_t num_row_tiles = ceil_div (index_length, int64_t (TILE_ROWS));
56+ const int64_t global_x = round_up (broadcasting_elems, int64_t (WG_X));
57+ sycl::nd_range<2 > launch (sycl::range<2 >(num_row_tiles, global_x),
58+ sycl::range<2 >(1 , WG_X));
59+
60+ queue.submit ([&](sycl::handler& cgh) {
61+ sycl::local_accessor<int64_t , 1 > l_idx (sycl::range<1 >(TILE_ROWS),
62+ cgh);
63+
64+ cgh.parallel_for (
65+ launch,
66+ [=](sycl::nd_item<2 > it) [[sycl::reqd_sub_group_size (
67+ 16 )]] {
68+ const int lid_x = int (it.get_local_id (1 ));
69+ const int64_t group_y = it.get_group (0 );
70+ const int64_t col = it.get_global_id (1 );
71+ if (col >= broadcasting_elems) {
72+ return ;
73+ }
74+
75+ const int64_t row_base = group_y * int64_t (TILE_ROWS);
76+
77+ if (lid_x < TILE_ROWS) {
78+ const int64_t r = row_base + lid_x;
79+ l_idx[lid_x] = (r < index_length) ? index_ptr[r]
80+ : int64_t (-1 );
81+ }
82+ it.barrier (sycl::access::fence_space::local_space);
83+
84+ int run_start = 0 ;
85+ while (run_start < TILE_ROWS) {
86+ const int64_t dst_row = l_idx[run_start];
87+ if (dst_row < 0 ) {
88+ break ;
89+ }
90+
91+ int run_end = run_start + 1 ;
92+ while (run_end < TILE_ROWS &&
93+ l_idx[run_end] == dst_row) {
94+ ++run_end;
95+ }
96+
97+ scalar_t sum = scalar_t (0 );
98+ for (int rr = run_start; rr < run_end; ++rr) {
99+ const int64_t src_row = row_base + int64_t (rr);
100+ if (src_row < index_length) {
101+ const int64_t workload_idx =
102+ src_row * broadcasting_elems + col;
103+ sum += src_ptr[workload_idx];
104+ }
105+ }
106+
107+ const int64_t dst_idx =
108+ dst_row * broadcasting_elems + col;
109+ sycl::atomic_ref<scalar_t ,
110+ sycl::memory_order::relaxed,
111+ sycl::memory_scope::device>
112+ aref (dst_ptr[dst_idx]);
113+ aref += sum;
114+
115+ run_start = run_end;
116+ }
117+ });
118+ }).wait_and_throw ();
119+ }
120+
121+ } // namespace
122+
18123void IndexAddSYCL_ (int64_t dim,
19124 const Tensor& index,
20125 const Tensor& src,
21126 Tensor& dst) {
22127 // index: [N,], src: [N, D], dst: [M, D]
23- // In Indexer, output shape defines the actual primary strides.
24- // However, in IndexAdd_, input dominates the iterations.
25- // So put dst (output) at indexer's input, and src (input) at output.
26- Indexer indexer ({dst}, src, DtypePolicy::NONE);
128+ // This kernel assumes contiguous layout for fast linear indexing.
129+ // Non-contiguous tensors are materialized as contiguous before launch.
130+ const Tensor index_contiguous = index.Contiguous ();
131+ const Tensor src_contiguous = src.Contiguous ();
132+ Tensor dst_contiguous = dst.Contiguous ();
27133
28- // Index is simply a 1D contiguous tensor, with a different stride
29- // behavior to src. So use raw pointer for simplicity.
30- auto index_ptr = index.GetDataPtr <int64_t >();
134+ // Index is simply a 1D contiguous tensor.
135+ auto index_ptr = index_contiguous.GetDataPtr <int64_t >();
31136
32137 int64_t broadcasting_elems = 1 ;
33- for (int64_t d = 1 ; d < src .NumDims (); ++d) {
34- broadcasting_elems *= src .GetShape (d);
138+ for (int64_t d = 1 ; d < src_contiguous .NumDims (); ++d) {
139+ broadcasting_elems *= src_contiguous .GetShape (d);
35140 }
141+
142+ const int64_t index_length = index_contiguous.GetLength ();
143+
36144 sycl::queue queue =
37145 sy::SYCLContext::GetInstance ().GetDefaultQueue (src.GetDevice ());
38146
39- // TODO: Replace with SYCL reduction API
40147 DISPATCH_FLOAT_DTYPE_TO_TEMPLATE (src.GetDtype (), [&]() {
41- queue.parallel_for (index.GetLength (), [=](int64_t workload_idx) {
42- int64_t reduction_idx = workload_idx / broadcasting_elems;
43- int64_t broadcasting_idx = workload_idx % broadcasting_elems;
44-
45- const int64_t idx = index_ptr[reduction_idx];
46- int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;
47-
48- // Note input and output is switched here to adapt to the
49- // indexer
50- scalar_t * src_ptr = indexer.GetOutputPtr <scalar_t >(0 , idx);
51- scalar_t * dst_ptr = indexer.GetInputPtr <scalar_t >(0 , dst_idx);
52- sycl::atomic_ref<scalar_t , sycl::memory_order::acq_rel,
53- sycl::memory_scope::device>(*dst_ptr) +=
54- *src_ptr;
55- }).wait_and_throw ();
148+ LaunchIndexAddContiguousSYCLKernel<scalar_t >(
149+ queue, index_ptr, src_contiguous.GetDataPtr <scalar_t >(),
150+ dst_contiguous.GetDataPtr <scalar_t >(), index_length,
151+ broadcasting_elems);
56152 });
153+
154+ // If dst is non-contiguous, write back from the contiguous temporary.
155+ if (!dst.IsContiguous ()) {
156+ dst.CopyFrom (dst_contiguous);
157+ }
57158}
58159
59160} // namespace kernel