Skip to content

Commit 03ed109

Browse files
Optimized IndexAddSYCL (#7455)
* Fix style auto fix. * Fix windows update_release.sh --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 7dd7bc1 commit 03ed109

File tree

9 files changed

+158
-52
lines changed

9 files changed

+158
-52
lines changed

.github/workflows/style.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
uses: actions/checkout@v4
2323
with:
2424
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
25-
ref: ${{ github.event.pull_request.head.sha || github.ref }}
25+
ref: ${{ github.event.pull_request.head.ref || github.ref }}
2626
token: ${{ secrets.GITHUB_TOKEN }}
2727
- name: Set up Python version
2828
uses: actions/setup-python@v5
@@ -47,7 +47,7 @@ jobs:
4747
git add -u
4848
if ! git diff --staged --quiet; then
4949
git commit -m "Apply automatic code formatting"
50-
git push || exit 1
50+
git push origin "HEAD:${{ github.event.pull_request.head.ref }}" || exit 1
5151
else
5252
echo "No formatting changes needed"
5353
fi
@@ -56,7 +56,7 @@ jobs:
5656
run: |
5757
git add -u
5858
if ! git diff --staged --quiet; then
59-
echo "::error::This PR requires formatting changes but is from a fork. Please run 'python util/check_style.py --apply' locally and push the changes."
59+
echo "::error::This PR requires formatting changes but is from a fork. Please run 'make apply-style' or 'python util/check_style.py --apply' locally and push the changes."
6060
exit 1
6161
else
6262
echo "No formatting changes needed"

.github/workflows/windows.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,9 @@ jobs:
177177
env:
178178
GH_TOKEN: ${{ github.token }}
179179
shell: bash
180+
working-directory: ${{ env.BUILD_DIR }}
180181
run: |
181-
.github/workflows/update_release.sh ${{ env.BUILD_DIR }}/package/${{ env.DEVEL_PKG_NAME }}
182+
.github/workflows/update_release.sh package/${{ env.DEVEL_PKG_NAME }}
182183
183184
- name: Viewer App
184185
working-directory: ${{ env.BUILD_DIR }}
@@ -347,8 +348,9 @@ jobs:
347348
env:
348349
GH_TOKEN: ${{ github.token }}
349350
shell: bash
351+
working-directory: ${{ env.BUILD_DIR }}
350352
run: |
351-
.github/workflows/update_release.sh ${{ env.BUILD_DIR }}/lib/python_package/pip_package/${{ env.PIP_PKG_NAME }}
353+
.github/workflows/update_release.sh lib/python_package/pip_package/${{ env.PIP_PKG_NAME }}
352354
353355
test-wheel:
354356
name: Test wheel

cpp/open3d/core/kernel/IndexReduction.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,25 @@ namespace open3d {
1414
namespace core {
1515
namespace kernel {
1616

17-
void IndexAdd_(int64_t dim,
17+
void IndexAdd_([[maybe_unused]] int64_t dim,
1818
const Tensor& index,
1919
const Tensor& src,
2020
Tensor& dst);
2121

22-
void IndexAddCPU_(int64_t dim,
22+
void IndexAddCPU_([[maybe_unused]] int64_t dim,
2323
const Tensor& index,
2424
const Tensor& src,
2525
Tensor& dst);
2626

2727
#ifdef BUILD_SYCL_MODULE
28-
void IndexAddSYCL_(int64_t dim,
28+
void IndexAddSYCL_([[maybe_unused]] int64_t dim,
2929
const Tensor& index,
3030
const Tensor& src,
3131
Tensor& dst);
3232
#endif
3333

3434
#ifdef BUILD_CUDA_MODULE
35-
void IndexAddCUDA_(int64_t dim,
35+
void IndexAddCUDA_([[maybe_unused]] int64_t dim,
3636
const Tensor& index,
3737
const Tensor& src,
3838
Tensor& dst);

cpp/open3d/core/kernel/IndexReductionSYCL.cpp

Lines changed: 127 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
// ----------------------------------------------------------------------------
77

88
#include "open3d/core/Dispatch.h"
9-
#include "open3d/core/Indexer.h"
109
#include "open3d/core/SYCLContext.h"
1110
#include "open3d/core/Tensor.h"
1211
#include "open3d/utility/Logging.h"
@@ -15,45 +14,147 @@ namespace open3d {
1514
namespace core {
1615
namespace kernel {
1716

17+
namespace {
18+
19+
template <typename scalar_t>
20+
// Launches contiguous index_add over dim0:
21+
// dst[index[i], ...] += src[i, ...]
22+
// Contract:
23+
// - `index_ptr`, `src_ptr`, and `dst_ptr` point to contiguous buffers.
24+
// - `index_length` is the length of `index_ptr` and the leading dimension of
25+
// `src_ptr`.
26+
// - `broadcasting_elems` is the flattened product of non-reduction dimensions.
27+
// - `dst_ptr` has enough rows to address all index values.
28+
void LaunchIndexAddContiguousSYCLKernel(sycl::queue& queue,
29+
const int64_t* index_ptr,
30+
const scalar_t* src_ptr,
31+
scalar_t* dst_ptr,
32+
int64_t index_length,
33+
int64_t broadcasting_elems) {
34+
if (index_length <= 0 || broadcasting_elems <= 0) {
35+
return;
36+
}
37+
38+
auto ceil_div = [](int64_t a, int64_t b) -> int64_t {
39+
return (a + b - 1) / b;
40+
};
41+
auto round_up = [](int64_t x, int64_t m) -> int64_t {
42+
return ((x + m - 1) / m) * m;
43+
};
44+
45+
// 2D launch configuration:
46+
// - X dimension tiles columns (broadcasting_elems).
47+
// - Y dimension tiles reduction rows (index_length).
48+
//
49+
// Each work-group processes TILE_ROWS rows and WG_X columns. Within a row
50+
// tile, consecutive runs of identical destination indices are reduced into
51+
// one atomic add per (column, run), reducing atomic pressure while
52+
// preserving index_add semantics.
53+
constexpr int WG_X = 256;
54+
constexpr int TILE_ROWS = 8;
55+
const int64_t num_row_tiles = ceil_div(index_length, int64_t(TILE_ROWS));
56+
const int64_t global_x = round_up(broadcasting_elems, int64_t(WG_X));
57+
sycl::nd_range<2> launch(sycl::range<2>(num_row_tiles, global_x),
58+
sycl::range<2>(1, WG_X));
59+
60+
queue.submit([&](sycl::handler& cgh) {
61+
sycl::local_accessor<int64_t, 1> l_idx(sycl::range<1>(TILE_ROWS),
62+
cgh);
63+
64+
cgh.parallel_for(
65+
launch,
66+
[=](sycl::nd_item<2> it) [[sycl::reqd_sub_group_size(
67+
16)]] {
68+
const int lid_x = int(it.get_local_id(1));
69+
const int64_t group_y = it.get_group(0);
70+
const int64_t col = it.get_global_id(1);
71+
if (col >= broadcasting_elems) {
72+
return;
73+
}
74+
75+
const int64_t row_base = group_y * int64_t(TILE_ROWS);
76+
77+
if (lid_x < TILE_ROWS) {
78+
const int64_t r = row_base + lid_x;
79+
l_idx[lid_x] = (r < index_length) ? index_ptr[r]
80+
: int64_t(-1);
81+
}
82+
it.barrier(sycl::access::fence_space::local_space);
83+
84+
int run_start = 0;
85+
while (run_start < TILE_ROWS) {
86+
const int64_t dst_row = l_idx[run_start];
87+
if (dst_row < 0) {
88+
break;
89+
}
90+
91+
int run_end = run_start + 1;
92+
while (run_end < TILE_ROWS &&
93+
l_idx[run_end] == dst_row) {
94+
++run_end;
95+
}
96+
97+
scalar_t sum = scalar_t(0);
98+
for (int rr = run_start; rr < run_end; ++rr) {
99+
const int64_t src_row = row_base + int64_t(rr);
100+
if (src_row < index_length) {
101+
const int64_t workload_idx =
102+
src_row * broadcasting_elems + col;
103+
sum += src_ptr[workload_idx];
104+
}
105+
}
106+
107+
const int64_t dst_idx =
108+
dst_row * broadcasting_elems + col;
109+
sycl::atomic_ref<scalar_t,
110+
sycl::memory_order::relaxed,
111+
sycl::memory_scope::device>
112+
aref(dst_ptr[dst_idx]);
113+
aref += sum;
114+
115+
run_start = run_end;
116+
}
117+
});
118+
}).wait_and_throw();
119+
}
120+
121+
} // namespace
122+
18123
void IndexAddSYCL_(int64_t dim,
19124
const Tensor& index,
20125
const Tensor& src,
21126
Tensor& dst) {
22127
// index: [N,], src: [N, D], dst: [M, D]
23-
// In Indexer, output shape defines the actual primary strides.
24-
// However, in IndexAdd_, input dominates the iterations.
25-
// So put dst (output) at indexer's input, and src (input) at output.
26-
Indexer indexer({dst}, src, DtypePolicy::NONE);
128+
// This kernel assumes contiguous layout for fast linear indexing.
129+
// Non-contiguous tensors are materialized as contiguous before launch.
130+
const Tensor index_contiguous = index.Contiguous();
131+
const Tensor src_contiguous = src.Contiguous();
132+
Tensor dst_contiguous = dst.Contiguous();
27133

28-
// Index is simply a 1D contiguous tensor, with a different stride
29-
// behavior to src. So use raw pointer for simplicity.
30-
auto index_ptr = index.GetDataPtr<int64_t>();
134+
// Index is simply a 1D contiguous tensor.
135+
auto index_ptr = index_contiguous.GetDataPtr<int64_t>();
31136

32137
int64_t broadcasting_elems = 1;
33-
for (int64_t d = 1; d < src.NumDims(); ++d) {
34-
broadcasting_elems *= src.GetShape(d);
138+
for (int64_t d = 1; d < src_contiguous.NumDims(); ++d) {
139+
broadcasting_elems *= src_contiguous.GetShape(d);
35140
}
141+
142+
const int64_t index_length = index_contiguous.GetLength();
143+
36144
sycl::queue queue =
37145
sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice());
38146

39-
// TODO: Replace with SYCL reduction API
40147
DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
41-
queue.parallel_for(index.GetLength(), [=](int64_t workload_idx) {
42-
int64_t reduction_idx = workload_idx / broadcasting_elems;
43-
int64_t broadcasting_idx = workload_idx % broadcasting_elems;
44-
45-
const int64_t idx = index_ptr[reduction_idx];
46-
int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;
47-
48-
// Note input and output is switched here to adapt to the
49-
// indexer
50-
scalar_t* src_ptr = indexer.GetOutputPtr<scalar_t>(0, idx);
51-
scalar_t* dst_ptr = indexer.GetInputPtr<scalar_t>(0, dst_idx);
52-
sycl::atomic_ref<scalar_t, sycl::memory_order::acq_rel,
53-
sycl::memory_scope::device>(*dst_ptr) +=
54-
*src_ptr;
55-
}).wait_and_throw();
148+
LaunchIndexAddContiguousSYCLKernel<scalar_t>(
149+
queue, index_ptr, src_contiguous.GetDataPtr<scalar_t>(),
150+
dst_contiguous.GetDataPtr<scalar_t>(), index_length,
151+
broadcasting_elems);
56152
});
153+
154+
// If dst is non-contiguous, write back from the contiguous temporary.
155+
if (!dst.IsContiguous()) {
156+
dst.CopyFrom(dst_contiguous);
157+
}
57158
}
58159

59160
} // namespace kernel

cpp/open3d/data/Dataset.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -754,9 +754,9 @@ class PaintedPlasterTexture : public DownloadDataset {
754754
};
755755

756756
/// \class RedwoodIndoorLivingRoom1
757-
/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM Dataset),
758-
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth
759-
/// sequence, oni sequence, and ground-truth camera trajectory.
757+
/// \brief Data class for `RedwoodIndoorLivingRoom1` (Augmented ICL-NUIM
758+
/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence,
759+
/// noisy depth sequence, oni sequence, and ground-truth camera trajectory.
760760
///
761761
/// RedwoodIndoorLivingRoom1
762762
/// ├── colors
@@ -810,9 +810,9 @@ class RedwoodIndoorLivingRoom1 : public DownloadDataset {
810810
};
811811

812812
/// \class RedwoodIndoorLivingRoom2
813-
/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM Dataset),
814-
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth
815-
/// sequence, oni sequence, and ground-truth camera trajectory.
813+
/// \brief Data class for `RedwoodIndoorLivingRoom2` (Augmented ICL-NUIM
814+
/// Dataset), containing dense point cloud, rgb sequence, clean depth sequence,
815+
/// noisy depth sequence, oni sequence, and ground-truth camera trajectory.
816816
///
817817
/// RedwoodIndoorLivingRoom2
818818
/// ├── colors
@@ -867,8 +867,8 @@ class RedwoodIndoorLivingRoom2 : public DownloadDataset {
867867

868868
/// \class RedwoodIndoorOffice1
869869
/// \brief Data class for `RedwoodIndoorOffice1` (Augmented ICL-NUIM Dataset),
870-
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth
871-
/// sequence, oni sequence, and ground-truth camera trajectory.
870+
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy
871+
/// depth sequence, oni sequence, and ground-truth camera trajectory.
872872
///
873873
/// RedwoodIndoorOffice1
874874
/// ├── colors
@@ -923,8 +923,8 @@ class RedwoodIndoorOffice1 : public DownloadDataset {
923923

924924
/// \class RedwoodIndoorOffice2
925925
/// \brief Data class for `RedwoodIndoorOffice2` (Augmented ICL-NUIM Dataset),
926-
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy depth
927-
/// sequence, oni sequence, and ground-truth camera trajectory.
926+
/// containing dense point cloud, rgb sequence, clean depth sequence, noisy
927+
/// depth sequence, oni sequence, and ground-truth camera trajectory.
928928
///
929929
/// RedwoodIndoorOffice2
930930
/// ├── colors

cpp/open3d/geometry/PointCloud.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,8 @@ class PointCloud : public Geometry3D {
281281
///
282282
///
283283
/// \param input PointCloud to use for covariance computation.
284-
/// \param search_param The KDTree search parameters for neighborhood search.
284+
/// \param search_param The KDTree search parameters for neighborhood
285+
/// search.
285286
static std::vector<Eigen::Matrix3d> EstimatePerPointCovariances(
286287
const PointCloud &input,
287288
const KDTreeSearchParam &search_param = KDTreeSearchParamKNN());

cpp/open3d/pipelines/registration/GlobalOptimizationConvergenceCriteria.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ class GlobalOptimizationConvergenceCriteria {
8181
/// increments.
8282
/// \param min_right_term Minimum right term value.
8383
/// \param min_residual Minimum residual value.
84-
/// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt method.
84+
/// \param max_iteration_lm Maximum iteration number for Levenberg Marquardt
85+
/// method.
8586
/// \param upper_scale_factor Upper scale factor value.
8687
/// \param lower_scale_factor Lower scale factor value.
8788
GlobalOptimizationConvergenceCriteria(

cpp/open3d/pipelines/registration/Registration.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ class ICPConvergenceCriteria {
3939
///
4040
/// \param relative_fitness If relative change (difference) of fitness score
4141
/// is lower than relative_fitness, the iteration stops.
42-
/// \param relative_rmse If relative change (difference) of inliner RMSE score is
43-
/// lower than relative_rmse, the iteration stops.
42+
/// \param relative_rmse If relative change (difference) of inliner RMSE
43+
/// score is lower than relative_rmse, the iteration stops.
4444
/// \param max_iteration Maximum iteration before iteration stops.
4545
ICPConvergenceCriteria(double relative_fitness = 1e-6,
4646
double relative_rmse = 1e-6,
@@ -52,10 +52,10 @@ class ICPConvergenceCriteria {
5252

5353
public:
5454
/// If relative change (difference) of fitness score is lower than
55-
/// `relative_fitness`, the iteration stops.
55+
/// `relative_fitness_`, the iteration stops.
5656
double relative_fitness_;
5757
/// If relative change (difference) of inliner RMSE score is lower than
58-
/// `relative_rmse`, the iteration stops.
58+
/// `relative_rmse_`, the iteration stops.
5959
double relative_rmse_;
6060
/// Maximum iteration before iteration stops.
6161
int max_iteration_;

cpp/open3d/pipelines/registration/TransformationEstimation.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ class TransformationEstimationPointToPlane : public TransformationEstimation {
139139
~TransformationEstimationPointToPlane() override {}
140140

141141
/// \brief Constructor that takes as input a RobustKernel.
142-
/// \param kernel Any of the implemented statistical robust kernel for outlier rejection.
142+
/// \param kernel Any of the implemented statistical robust kernel for
143+
/// outlier rejection.
143144
explicit TransformationEstimationPointToPlane(
144145
std::shared_ptr<RobustKernel> kernel)
145146
: kernel_(std::move(kernel)) {}

0 commit comments

Comments
 (0)