Skip to content

Commit 918189f

Browse files
authored
Improve performance for small string gather (#20656)
This MR improves the performance of the gather API for small string columns (average length <= 32 characters) by using the cub::DeviceMemcpy::Batched API to perform the gather for string columns with more than ~0.5 million rows. The row-count threshold was chosen based on benchmarking data on H100. The MR adds a test case that exercises the string `gather` implementation with a kernel launch of > 1 CTA to verify correctness. It also adds more values to the gather benchmark's parameter sweep to represent larger input row counts. Authors: - Tanmay Gujar (https://github.com/tgujar) - David Wendt (https://github.com/davidwendt) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: #20656
1 parent 9ebb19b commit 918189f

File tree

3 files changed

+146
-68
lines changed

3 files changed

+146
-68
lines changed

cpp/benchmarks/string/copy.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,6 @@ NVBENCH_BENCH(bench_copy)
6060
.set_name("copy")
6161
.add_int64_axis("min_width", {0})
6262
.add_int64_axis("max_width", {32, 64, 128, 256})
63-
.add_int64_axis("num_rows", {32768, 262144, 2097152})
63+
.add_int64_axis("num_rows",
64+
{131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432})
6465
.add_string_axis("api", {"gather", "scatter"});

cpp/include/cudf/strings/detail/gather.cuh

Lines changed: 88 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,26 @@
1111
#include <cudf/detail/offsets_iterator_factory.cuh>
1212
#include <cudf/detail/utilities/cuda.cuh>
1313
#include <cudf/detail/utilities/grid_1d.cuh>
14+
#include <cudf/detail/utilities/integer_utils.hpp>
1415
#include <cudf/strings/detail/strings_children.cuh>
1516
#include <cudf/strings/detail/utilities.hpp>
1617
#include <cudf/strings/strings_column_view.hpp>
1718
#include <cudf/utilities/memory_resource.hpp>
1819
#include <cudf/utilities/prefetch.hpp>
1920

2021
#include <rmm/cuda_stream_view.hpp>
22+
#include <rmm/device_buffer.hpp>
2123
#include <rmm/exec_policy.hpp>
2224

25+
#include <cub/cub.cuh>
2326
#include <cuda/functional>
2427
#include <cuda/std/iterator>
2528
#include <thrust/binary_search.h>
2629
#include <thrust/execution_policy.h>
2730
#include <thrust/iterator/transform_iterator.h>
2831

32+
#include <cstddef>
33+
2934
namespace cudf {
3035
namespace strings {
3136
namespace detail {
@@ -190,68 +195,6 @@ CUDF_KERNEL void gather_chars_fn_char_parallel(StringIterator strings_begin,
190195
}
191196
}
192197

193-
/**
194-
* @brief Returns a new chars column using the specified indices to select
195-
* strings from the input iterator.
196-
*
197-
* This uses a character-parallel gather CUDA kernel that performs very
198-
* well on a strings column with long strings (e.g. average > 64 bytes).
199-
*
200-
* @tparam StringIterator Iterator should produce `string_view` objects.
201-
* @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
202-
*
203-
* @param strings_begin Start of the iterator to retrieve `string_view` instances.
204-
* @param map_begin Start of index iterator.
205-
* @param map_end End of index iterator.
206-
* @param offsets The offset values to be associated with the output chars column.
207-
* @param chars_bytes The total number of bytes for the output chars column.
208-
* @param stream CUDA stream used for device memory operations and kernel launches.
209-
* @param mr Device memory resource used to allocate the returned column's device memory.
210-
* @return New chars column fit for a strings column.
211-
*/
212-
template <typename StringIterator, typename MapIterator>
213-
rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
214-
MapIterator map_begin,
215-
MapIterator map_end,
216-
cudf::detail::input_offsetalator const offsets,
217-
int64_t chars_bytes,
218-
rmm::cuda_stream_view stream,
219-
rmm::device_async_resource_ref mr)
220-
{
221-
auto const output_count = std::distance(map_begin, map_end);
222-
if (output_count == 0) return rmm::device_uvector<char>(0, stream, mr);
223-
224-
auto chars_data = rmm::device_uvector<char>(chars_bytes, stream, mr);
225-
cudf::prefetch::detail::prefetch(chars_data, stream);
226-
auto d_chars = chars_data.data();
227-
228-
constexpr int warps_per_threadblock = 4;
229-
// String parallel strategy will be used if average string length is above this threshold.
230-
// Otherwise, char parallel strategy will be used.
231-
constexpr int64_t string_parallel_threshold = 32;
232-
233-
int64_t const average_string_length = chars_bytes / output_count;
234-
235-
if (average_string_length > string_parallel_threshold) {
236-
constexpr int max_threadblocks = 65536;
237-
gather_chars_fn_string_parallel<<<
238-
min((static_cast<int>(output_count) + warps_per_threadblock - 1) / warps_per_threadblock,
239-
max_threadblocks),
240-
warps_per_threadblock * cudf::detail::warp_size,
241-
0,
242-
stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
243-
} else {
244-
constexpr int strings_per_threadblock = 32;
245-
gather_chars_fn_char_parallel<strings_per_threadblock>
246-
<<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock,
247-
warps_per_threadblock * cudf::detail::warp_size,
248-
0,
249-
stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
250-
}
251-
252-
return chars_data;
253-
}
254-
255198
/**
256199
* @brief Returns a new strings column using the specified indices to select
257200
* elements from the `strings` column.
@@ -299,15 +242,94 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
299242
if (not d_strings.is_valid(idx)) { return 0; }
300243
return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
301244
}));
302-
auto [out_offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
245+
246+
auto [out_offsets_column, out_char_bytes] = cudf::strings::detail::make_offsets_child_column(
303247
sizes_itr, sizes_itr + output_count, stream, mr);
304248

305-
// build chars column
249+
// build out offset view
306250
auto const offsets_view =
307251
cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
308252
cudf::prefetch::detail::prefetch(strings.chars_begin(stream), strings.chars_size(stream), stream);
309-
auto out_chars_data = gather_chars(
310-
d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);
253+
254+
// build output char column
255+
auto out_chars_data = rmm::device_uvector<char>(out_char_bytes, stream, mr);
256+
cudf::prefetch::detail::prefetch(out_chars_data, stream);
257+
auto d_out_chars = out_chars_data.data();
258+
259+
constexpr int warps_per_threadblock = 4;
260+
// String parallel strategy will be used if average string length is above this threshold.
261+
// Otherwise, char parallel strategy will be used.
262+
constexpr int64_t string_parallel_threshold = 32;
263+
264+
int64_t const average_string_length = out_char_bytes / output_count;
265+
266+
if (average_string_length > string_parallel_threshold) {
267+
constexpr int max_threadblocks = 65536;
268+
auto const grid_size =
269+
min(cudf::util::div_rounding_up_safe(static_cast<int64_t>(output_count),
270+
static_cast<int64_t>(warps_per_threadblock)),
271+
static_cast<int64_t>(max_threadblocks));
272+
gather_chars_fn_string_parallel<<<grid_size,
273+
warps_per_threadblock * cudf::detail::warp_size,
274+
0,
275+
stream.value()>>>(
276+
d_strings->begin<string_view>(), d_out_chars, offsets_view, begin, output_count);
277+
} else {
278+
// Threshold is based on empirical data on H100.
279+
// If row count is above this threshold we use the cub::DeviceMemcpy::Batched API, otherwise we
280+
// use the custom cuDF kernel.
281+
constexpr int64_t cub_batch_copy_threshold = 1024 * 1024 * 0.5;
282+
283+
if (output_count < cub_batch_copy_threshold) {
284+
constexpr int strings_per_threadblock = 32;
285+
auto const grid_size = cudf::util::div_rounding_up_safe(
286+
static_cast<int64_t>(output_count), static_cast<int64_t>(strings_per_threadblock));
287+
gather_chars_fn_char_parallel<strings_per_threadblock>
288+
<<<grid_size, warps_per_threadblock * cudf::detail::warp_size, 0, stream.value()>>>(
289+
d_strings->begin<string_view>(), d_out_chars, offsets_view, begin, output_count);
290+
} else {
291+
// Iterator over the character column of input strings to gather
292+
auto in_chars_itr = thrust::make_transform_iterator(
293+
begin,
294+
cuda::proclaim_return_type<const char*>([d_strings = *d_strings] __device__(size_type idx) {
295+
if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) {
296+
return static_cast<const char*>(nullptr);
297+
}
298+
if (not d_strings.is_valid(idx)) { return static_cast<const char*>(nullptr); }
299+
return d_strings.element<string_view>(idx).data();
300+
}));
301+
302+
// Iterator over the output locations to write the output
303+
auto out_chars_itr = cudf::detail::make_counting_transform_iterator(
304+
0,
305+
cuda::proclaim_return_type<char*>(
306+
[d_strings = *d_strings, offsets_view, d_out_chars] __device__(size_type idx) {
307+
return d_out_chars + offsets_view[idx];
308+
}));
309+
310+
// Determine temporary device storage requirements
311+
size_t temp_storage_bytes = 0;
312+
cub::DeviceMemcpy::Batched(nullptr,
313+
temp_storage_bytes,
314+
in_chars_itr,
315+
out_chars_itr,
316+
sizes_itr,
317+
output_count,
318+
stream.value());
319+
320+
// Allocate temporary storage
321+
auto d_temp_storage = rmm::device_buffer(temp_storage_bytes, stream, mr);
322+
323+
// Run batched copy algorithm
324+
cub::DeviceMemcpy::Batched(d_temp_storage.data(),
325+
temp_storage_bytes,
326+
in_chars_itr,
327+
out_chars_itr,
328+
sizes_itr,
329+
output_count,
330+
stream.value());
331+
}
332+
}
311333

312334
return make_strings_column(output_count,
313335
std::move(out_offsets_column),

cpp/tests/copying/gather_str_tests.cpp

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#include <cudf_test/base_fixture.hpp>
@@ -14,6 +14,8 @@
1414
#include <cudf/table/table_view.hpp>
1515
#include <cudf/utilities/memory_resource.hpp>
1616

17+
#include <random>
18+
1719
class GatherTestStr : public cudf::test::BaseFixture {};
1820

1921
TEST_F(GatherTestStr, StringColumn)
@@ -145,3 +147,56 @@ TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
145147
cudf::get_current_device_resource_ref());
146148
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results->get_column(0).view());
147149
}
150+
151+
TEST_F(GatherTestStr, GatherRandomStringsColumn)
152+
{
153+
constexpr int num_total_strings = 512;
154+
constexpr int num_gathered_strings = 128;
155+
156+
std::mt19937 rng(12345);
157+
std::uniform_int_distribution<int> len_dist(0, 20);
158+
std::uniform_int_distribution<int> ch_dist(97, 122); // 'a'..'z'
159+
160+
// Generate random strings
161+
std::vector<std::string> host_strings;
162+
host_strings.reserve(num_total_strings);
163+
for (int i = 0; i < num_total_strings; ++i) {
164+
int len = len_dist(rng);
165+
std::string s;
166+
s.reserve(len);
167+
for (int j = 0; j < len; ++j) {
168+
s.push_back(static_cast<char>(ch_dist(rng)));
169+
}
170+
host_strings.push_back(std::move(s));
171+
}
172+
173+
std::vector<char const*> h_ptrs;
174+
h_ptrs.reserve(num_total_strings);
175+
for (auto& s : host_strings) {
176+
h_ptrs.push_back(s.c_str());
177+
}
178+
179+
cudf::test::strings_column_wrapper strings(h_ptrs.begin(), h_ptrs.end());
180+
cudf::table_view source_table({strings});
181+
182+
// Generate random string indices to gather
183+
std::uniform_int_distribution<int> idx_dist(0, num_total_strings - 1);
184+
std::vector<int32_t> h_map;
185+
h_map.reserve(num_gathered_strings);
186+
for (int i = 0; i < num_gathered_strings; ++i) {
187+
h_map.push_back(static_cast<int32_t>(idx_dist(rng)));
188+
}
189+
190+
// Gather strings
191+
cudf::test::fixed_width_column_wrapper<int32_t> gather_map(h_map.begin(), h_map.end());
192+
auto result = cudf::gather(source_table, gather_map);
193+
194+
std::vector<char const*> h_expected;
195+
h_expected.reserve(num_gathered_strings);
196+
for (auto idx : h_map) {
197+
h_expected.push_back(h_ptrs[static_cast<size_t>(idx)]);
198+
}
199+
cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
200+
201+
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), expected);
202+
}

0 commit comments

Comments
 (0)