From c936ea64affcb86e8a17c76ce3db25cc6de8f58a Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Jun 2026 16:11:25 -0700 Subject: [PATCH 1/2] [C++][Compute] Support string_view/binary_view keys in the hash-aggregate Grouper --- cpp/src/arrow/compute/row/grouper.cc | 12 +- .../arrow/compute/row/grouper_benchmark.cc | 27 ++++- cpp/src/arrow/compute/row/grouper_test.cc | 83 +++++++++++++- .../arrow/compute/row/row_encoder_internal.cc | 108 ++++++++++++++++++ .../arrow/compute/row/row_encoder_internal.h | 28 +++++ .../compute/row/row_encoder_internal_test.cc | 75 ++++++++++++ 6 files changed, 328 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index a342e5a6b1bf..a5c0732f1695 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -373,6 +373,12 @@ struct GrouperImpl : public Grouper { continue; } + if (is_binary_view_like(key_type.id())) { + impl->encoders_[i] = + std::make_unique(key_type.GetSharedPtr()); + continue; + } + if (key_type.id() == Type::NA) { impl->encoders_[i] = std::make_unique(); continue; @@ -556,7 +562,11 @@ struct GrouperFastImpl : public Grouper { } #if ARROW_LITTLE_ENDIAN for (size_t i = 0; i < key_types.size(); ++i) { - if (is_large_binary_like(key_types[i].id())) { + // The fast (Swiss-table) row format can't represent variadic view buffers, + // so view keys fall back to GrouperImpl, which handles them via + // BinaryViewKeyEncoder. + if (is_large_binary_like(key_types[i].id()) || + is_binary_view_like(key_types[i].id())) { return false; } } diff --git a/cpp/src/arrow/compute/row/grouper_benchmark.cc b/cpp/src/arrow/compute/row/grouper_benchmark.cc index 1e1a16d57900..245d1cd76b80 100644 --- a/cpp/src/arrow/compute/row/grouper_benchmark.cc +++ b/cpp/src/arrow/compute/row/grouper_benchmark.cc @@ -20,9 +20,11 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/string.h" +#include "arrow/compute/cast.h" #include "arrow/compute/row/grouper.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/type_traits.h" #include "arrow/util/benchmark_util.h" namespace arrow { @@ -62,8 +64,23 @@ static ExecBatch MakeRandomExecBatch(const DataTypeVector& types, int64_t num_ro std::vector values; values.resize(num_types); for (int i = 0; i < num_types; ++i) { - auto field = ::arrow::field("", types[i], metadata); - values[i] = rng.ArrayOf(*field, num_rows, alignment, memory_pool); + // The random generator honors the "unique" knob (which controls group + // cardinality) for plain int32-offset binary/string but not for view types. + // To keep view-key benchmarks comparable to their non-view counterparts, + // generate the plain equivalent column and cast it to the view type; the + // cast happens outside the timed grouping loop. + const auto& type = types[i]; + const bool is_view = is_binary_view_like(*type); + std::shared_ptr gen_type = type; + if (is_view) { + gen_type = type->id() == Type::STRING_VIEW ? utf8() : binary(); + } + auto field = ::arrow::field("", gen_type, metadata); + Datum value = rng.ArrayOf(*field, num_rows, alignment, memory_pool); + if (is_view) { + ASSIGN_OR_ABORT(value, compute::Cast(value, type)); + } + values[i] = std::move(value); } return ExecBatch(std::move(values), num_rows); @@ -125,6 +142,9 @@ BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean}", {boolean()})->Apply(SetArg BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32}", {int32()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int64}", {int64()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8}", {utf8()})->Apply(SetArgs); +// View ("German string") keys route through the generic GrouperImpl (the +// Swiss-table fast path used by {utf8} rejects view keys), like large_utf8. +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8_view}", {utf8_view()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{fixed_size_binary(32)}", {fixed_size_binary(32)}) ->Apply(SetArgs); @@ -147,6 +167,9 @@ BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, boolean, utf8}", BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int64, boolean, utf8}", {int32(), int64(), boolean(), utf8()}) ->Apply(SetArgs); +BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32, int64, boolean, utf8_view}", + {int32(), int64(), boolean(), utf8_view()}) + ->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8, int32, int64, fixed_size_binary(32), boolean}", {utf8(), int32(), int64(), fixed_size_binary(32), boolean()}) diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc index 901d1b6178a4..fd6a2b850a1b 100644 --- a/cpp/src/arrow/compute/row/grouper_test.cc +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -23,6 +23,7 @@ #include "arrow/array.h" #include "arrow/array/util.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/compute/row/grouper.h" #include "arrow/compute/row/grouper_internal.h" @@ -120,6 +121,8 @@ void TestGroupClassSupportedKeys( ASSERT_OK(make_func({utf8(), binary(), large_utf8(), large_binary()})); + ASSERT_OK(make_func({utf8_view(), binary_view()})); + ASSERT_OK(make_func({fixed_size_binary(16), fixed_size_binary(32)})); ASSERT_OK(make_func({decimal128(32, 10), decimal256(76, 20)})); @@ -754,13 +757,26 @@ struct TestGrouper { auto group_ids = id_batch.make_array(); ValidateOutput(*group_ids); + // View ("German string") types don't have take kernels yet (GH-43010), so + // round-trip the validation through their non-view equivalent. This doesn't + // weaken the check: the grouper's output type is verified directly by + // ExpectUniques, and the encoded key bytes are identical across the layouts. + auto as_takeable = [](std::shared_ptr arr) -> std::shared_ptr { + if (is_binary_view_like(*arr->type())) { + auto target = arr->type_id() == Type::STRING_VIEW ? utf8() : binary(); + arr = Cast(Datum(arr), target).ValueOrDie().make_array(); + } + return arr; + }; + for (int i = 0; i < key_batch.num_values(); ++i) { SCOPED_TRACE(ToChars(i) + "th key array"); auto original = key_batch[i].is_array() ? key_batch[i].make_array() : *MakeArrayFromScalar(*key_batch[i].scalar(), key_batch.length); - ASSERT_OK_AND_ASSIGN(auto encoded, Take(*uniques_[i].make_array(), *group_ids)); + ASSERT_OK_AND_ASSIGN(auto encoded, + Take(*as_takeable(uniques_[i].make_array()), *group_ids)); std::shared_ptr expected = original; if (can_be_null && original->type_id() != Type::NA) { // To compute the expected output, mask out the original entries that @@ -791,7 +807,7 @@ struct TestGrouper { expected_data->null_count = kUnknownNullCount; expected = MakeArray(expected_data); } - AssertArraysEqual(*expected, *encoded, /*verbose=*/true, + AssertArraysEqual(*as_takeable(expected), *encoded, /*verbose=*/true, EqualOptions().nans_equal(true)); } } @@ -913,6 +929,47 @@ TEST(Grouper, StringKey) { } } +TEST(Grouper, StringViewKey) { + // View ("German string") keys route through GrouperImpl's BinaryViewKeyEncoder + // (GrouperFastImpl rejects them). Mix short keys (stored inline in the view + // header) with keys longer than 12 bytes (stored out-of-line) to exercise both + // view representations. + for (auto ty : {utf8_view(), binary_view()}) { + ARROW_SCOPED_TRACE("key type = ", *ty); + { + TestGrouper g({ty}); + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + g.ExpectConsume(R"([["eh"], ["eh"]])", "[0, 0]"); + g.ExpectConsume(R"([["be"], [null]])", "[1, 2]"); + g.ExpectConsume(R"([["a long out-of-line view"], ["a long out-of-line view"]])", + "[3, 3]"); + // GetUniques must round-trip back to the original view type, preserving the + // inline/out-of-line distinction transparently. + g.ExpectUniques(R"([["eh"], ["be"], [null], ["a long out-of-line view"]])"); + } + { + TestGrouper g({ty}); + g.ExpectPopulate(R"([["eh"], ["eh"]])"); + g.ExpectPopulate(R"([["be"], [null]])"); + g.ExpectLookup(R"([["be"], [null], ["da"]])", "[1, 2, null]"); + } + } +} + +TEST(Grouper, StringViewInt64Key) { + for (auto ty : {utf8_view(), binary_view()}) { + ARROW_SCOPED_TRACE("key type = ", *ty); + TestGrouper g({ty, int64()}); + + g.ExpectConsume(R"([["eh", 0], ["eh", 0]])", "[0, 0]"); + g.ExpectConsume(R"([["eh", 0], ["eh", null]])", "[0, 1]"); + g.ExpectConsume(R"([["eh", 1], ["bee", 1]])", "[2, 3]"); + g.ExpectConsume(R"([["eh", null], ["bee", 1]])", "[1, 3]"); + + g.ExpectUniques(R"([["eh", 0], ["eh", null], ["eh", 1], ["bee", 1]])"); + } +} + TEST(Grouper, DictKey) { // For dictionary keys, all batches must share a single dictionary. // Eventually, differing dictionaries will be unified and indices transposed @@ -1074,6 +1131,12 @@ FieldVector AnnotateForRandomGeneration(FieldVector fields) { } else if (is_binary_like(*field->type())) { // (note this is unsupported for large binary types) field = field->WithMergedMetadata(key_value_metadata({"unique"}, {"100"})); + } else if (is_binary_view_like(*field->type())) { + // View types don't support the "unique" knob; constrain the string length + // instead so that group ids repeat (and a mix of inline/out-of-line views + // is exercised). + field = field->WithMergedMetadata( + key_value_metadata({"min_length", "max_length"}, {"0", "5"})); } field = field->WithMergedMetadata(key_value_metadata({"null_probability"}, {"0.1"})); } @@ -1130,6 +1193,22 @@ TEST(Grouper, RandomStringInt64DoubleInt32Keys) { TestRandomLookup(TestGrouper({utf8(), int64(), float64(), int32()})); } +TEST(Grouper, RandomStringViewKeys) { + for (auto ty : {utf8_view(), binary_view()}) { + ARROW_SCOPED_TRACE("key type = ", *ty); + TestRandomConsume(TestGrouper({ty})); + TestRandomLookup(TestGrouper({ty})); + } +} + +TEST(Grouper, RandomStringViewInt64Keys) { + for (auto ty : {utf8_view(), binary_view()}) { + ARROW_SCOPED_TRACE("key type = ", *ty); + TestRandomConsume(TestGrouper({ty, int64()})); + TestRandomLookup(TestGrouper({ty, int64()})); + } +} + TEST(Grouper, NullKeys) { { TestGrouper g({null()}); diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.cc b/cpp/src/arrow/compute/row/row_encoder_internal.cc index eefc2ffb2f71..521f6692d3fc 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.cc +++ b/cpp/src/arrow/compute/row/row_encoder_internal.cc @@ -17,6 +17,7 @@ #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/array/builder_binary.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/logging_internal.h" @@ -256,6 +257,108 @@ Result> DictionaryKeyEncoder::Decode(uint8_t** encode return data; } +void BinaryViewKeyEncoder::AddLength(const ExecValue& data, int64_t batch_length, + int32_t* lengths) { + if (data.is_array()) { + int64_t i = 0; + ARROW_DCHECK_EQ(data.array.length, batch_length); + VisitArraySpanInline( + data.array, + [&](std::string_view bytes) { + lengths[i++] += + kExtraByteForNull + sizeof(Offset) + static_cast(bytes.size()); + }, + [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); }); + } else { + const Scalar& scalar = *data.scalar; + const int32_t buffer_size = + scalar.is_valid + ? static_cast(UnboxScalar::Unbox(scalar).size()) + : 0; + for (int64_t i = 0; i < batch_length; i++) { + lengths[i] += kExtraByteForNull + sizeof(Offset) + buffer_size; + } + } +} + +void BinaryViewKeyEncoder::AddLengthNull(int32_t* length) { + *length += kExtraByteForNull + sizeof(Offset); +} + +Status BinaryViewKeyEncoder::Encode(const ExecValue& data, int64_t batch_length, + uint8_t** encoded_bytes) { + auto handle_next_valid_value = [&encoded_bytes](std::string_view bytes) { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kValidByte; + util::SafeStore(encoded_ptr, static_cast(bytes.size())); + encoded_ptr += sizeof(Offset); + memcpy(encoded_ptr, bytes.data(), bytes.size()); + encoded_ptr += bytes.size(); + }; + auto handle_next_null_value = [&encoded_bytes]() { + auto& encoded_ptr = *encoded_bytes++; + *encoded_ptr++ = kNullByte; + util::SafeStore(encoded_ptr, static_cast(0)); + encoded_ptr += sizeof(Offset); + }; + if (data.is_array()) { + ARROW_DCHECK_EQ(data.length(), batch_length); + VisitArraySpanInline(data.array, handle_next_valid_value, + handle_next_null_value); + } else { + const auto& scalar = data.scalar_as(); + if (scalar.is_valid) { + const auto bytes = std::string_view{*scalar.value}; + for (int64_t i = 0; i < batch_length; i++) { + handle_next_valid_value(bytes); + } + } else { + for (int64_t i = 0; i < batch_length; i++) { + handle_next_null_value(); + } + } + } + return Status::OK(); +} + +void BinaryViewKeyEncoder::EncodeNull(uint8_t** encoded_bytes) { + auto& encoded_ptr = *encoded_bytes; + *encoded_ptr++ = kNullByte; + util::SafeStore(encoded_ptr, static_cast(0)); + encoded_ptr += sizeof(Offset); +} + +Result> BinaryViewKeyEncoder::Decode(uint8_t** encoded_bytes, + int32_t length, + MemoryPool* pool) { + // Build a fresh view array. MakeBuilder yields the type-faithful builder + // (BinaryViewBuilder for binary_view, StringViewBuilder for utf8_view), and + // Append decides inline vs. out-of-line storage, so we never share buffers + // with the encoded input. + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, type_, &builder)); + auto& view_builder = checked_cast(*builder); + RETURN_NOT_OK(view_builder.Reserve(length)); + + for (int32_t i = 0; i < length; ++i) { + uint8_t*& encoded_ptr = encoded_bytes[i]; + const bool is_valid = (*encoded_ptr++ == kValidByte); + const auto key_length = util::SafeLoadAs(encoded_ptr); + encoded_ptr += sizeof(Offset); + if (is_valid) { + RETURN_NOT_OK(view_builder.Append(encoded_ptr, key_length)); + } else { + RETURN_NOT_OK(view_builder.AppendNull()); + } + // Null rows encode a zero length, so this is a no-op for them. + encoded_ptr += key_length; + } + + std::shared_ptr out; + RETURN_NOT_OK(view_builder.Finish(&out)); + return out->data(); +} + void RowEncoder::Init(const std::vector& column_types, ExecContext* ctx) { ctx_ = ctx; encoders_.resize(column_types.size()); @@ -301,6 +404,11 @@ void RowEncoder::Init(const std::vector& column_types, ExecContext* continue; } + if (is_binary_view_like(type.id())) { + encoders_[i] = std::make_shared(type.GetSharedPtr()); + continue; + } + // We should not get here ARROW_DCHECK(false); } diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.h b/cpp/src/arrow/compute/row/row_encoder_internal.h index 9337e78bf8a2..5fd2565873d9 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.h +++ b/cpp/src/arrow/compute/row/row_encoder_internal.h @@ -252,6 +252,34 @@ struct VarLengthKeyEncoder : KeyEncoder { std::shared_ptr type_; }; +// Encodes BinaryView/StringView ("German string") keys using the same +// variable-length row format as VarLengthKeyEncoder: a leading null byte, a +// 4-byte length prefix, then the raw key bytes. The encoded row copies the key +// bytes (it never references the input's variadic data buffers), so Decode +// builds a fresh view array rather than aliasing shared buffers. +struct ARROW_COMPUTE_EXPORT BinaryViewKeyEncoder : KeyEncoder { + // The on-row length prefix. Group keys are bounded by the int32 row format, so + // a 32-bit length matches the binary (int32-offset) encoder's wire format. + using Offset = int32_t; + + explicit BinaryViewKeyEncoder(std::shared_ptr type) + : type_(std::move(type)) {} + + void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override; + + void AddLengthNull(int32_t* length) override; + + Status Encode(const ExecValue& data, int64_t batch_length, + uint8_t** encoded_bytes) override; + + void EncodeNull(uint8_t** encoded_bytes) override; + + Result> Decode(uint8_t** encoded_bytes, int32_t length, + MemoryPool* pool) override; + + std::shared_ptr type_; +}; + struct ARROW_COMPUTE_EXPORT NullKeyEncoder : KeyEncoder { void AddLength(const ExecValue&, int64_t batch_length, int32_t* lengths) override {} diff --git a/cpp/src/arrow/compute/row/row_encoder_internal_test.cc b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc index 78839d1ead55..c2a68421e171 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal_test.cc +++ b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc @@ -20,7 +20,9 @@ #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/array.h" #include "arrow/array/validate.h" +#include "arrow/scalar.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" @@ -65,4 +67,77 @@ TEST(TestKeyEncoder, BooleanScalar) { } } +// Encodes `value` (either an array, or a scalar repeated `length` times) through +// the key encoder and decodes it back, laying out one contiguous row buffer per +// value the way RowEncoder does. Encode and Decode advance the per-row pointers, +// so they are reset between the two calls. +Result> RoundTripThroughKeyEncoder(KeyEncoder* encoder, + const ExecValue& value, + int32_t length) { + std::vector lengths(length, 0); + encoder->AddLength(value, length, lengths.data()); + + std::vector offsets(length + 1, 0); + for (int32_t i = 0; i < length; ++i) { + offsets[i + 1] = offsets[i] + lengths[i]; + } + std::vector bytes(offsets[length]); + std::vector payload_ptrs(length); + auto reset_payload_ptrs = [&] { + for (int32_t i = 0; i < length; ++i) { + payload_ptrs[i] = bytes.data() + offsets[i]; + } + }; + + reset_payload_ptrs(); + ARROW_RETURN_NOT_OK(encoder->Encode(value, length, payload_ptrs.data())); + reset_payload_ptrs(); + return encoder->Decode(payload_ptrs.data(), length, ::arrow::default_memory_pool()); +} + +// GH-43733 follow-up: the BinaryViewKeyEncoder must round-trip view ("German +// string") keys through the variable-length row format and rebuild a view array +// of the original type. Covers short (inline) and >12-byte (out-of-line) views, +// empties, and nulls, for both the array and scalar input paths. +TEST(TestKeyEncoder, BinaryViewArray) { + for (const auto& ty : {utf8_view(), binary_view()}) { + SCOPED_TRACE("type " + ty->ToString()); + auto array = + ArrayFromJSON(ty, R"(["short", null, "a long out-of-line value", "", "x"])"); + + BinaryViewKeyEncoder key_encoder(ty); + ASSERT_OK_AND_ASSIGN( + auto decoded, RoundTripThroughKeyEncoder(&key_encoder, ExecValue{*array->data()}, + static_cast(array->length()))); + + ASSERT_OK(arrow::internal::ValidateArrayFull(*decoded)); + ASSERT_EQ(decoded->type->id(), ty->id()); + AssertArraysEqual(*array, *MakeArray(decoded), /*verbose=*/true); + } +} + +TEST(TestKeyEncoder, BinaryViewScalar) { + constexpr int32_t kBatchLength = 8; + for (const auto& ty : {utf8_view(), binary_view()}) { + SCOPED_TRACE("type " + ty->ToString()); + // A scalar key is encoded once per row; cover inline, out-of-line, and null. + for (const auto& scalar : + {ScalarFromJSON(ty, R"("short")"), + ScalarFromJSON(ty, R"("a long out-of-line value")"), MakeNullScalar(ty)}) { + SCOPED_TRACE("scalar " + scalar->ToString()); + BinaryViewKeyEncoder key_encoder(ty); + ASSERT_OK_AND_ASSIGN( + auto decoded, RoundTripThroughKeyEncoder(&key_encoder, ExecValue{scalar.get()}, + kBatchLength)); + + ASSERT_OK(arrow::internal::ValidateArrayFull(*decoded)); + ASSERT_EQ(decoded->type->id(), ty->id()); + ASSERT_OK_AND_ASSIGN( + auto expected, + MakeArrayFromScalar(*scalar, kBatchLength, ::arrow::default_memory_pool())); + AssertArraysEqual(*expected, *MakeArray(decoded), /*verbose=*/true); + } + } +} + } // namespace arrow::compute::internal From 4025daf39e9294c82098d25b21f6cab80ac1e5e7 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Jun 2026 18:15:55 -0700 Subject: [PATCH 2/2] simplify comments --- cpp/src/arrow/compute/row/grouper.cc | 5 ++--- cpp/src/arrow/compute/row/grouper_benchmark.cc | 10 +++------- cpp/src/arrow/compute/row/grouper_test.cc | 17 ++++------------- .../arrow/compute/row/row_encoder_internal.cc | 9 +++------ .../arrow/compute/row/row_encoder_internal.h | 11 ++++------- .../compute/row/row_encoder_internal_test.cc | 13 ++++--------- 6 files changed, 20 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index a5c0732f1695..d3eafef2cfa7 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -562,9 +562,8 @@ struct GrouperFastImpl : public Grouper { } #if ARROW_LITTLE_ENDIAN for (size_t i = 0; i < key_types.size(); ++i) { - // The fast (Swiss-table) row format can't represent variadic view buffers, - // so view keys fall back to GrouperImpl, which handles them via - // BinaryViewKeyEncoder. + // View keys fall back to GrouperImpl; the fast row format can't hold + // their variadic buffers. if (is_large_binary_like(key_types[i].id()) || is_binary_view_like(key_types[i].id())) { return false; diff --git a/cpp/src/arrow/compute/row/grouper_benchmark.cc b/cpp/src/arrow/compute/row/grouper_benchmark.cc index 245d1cd76b80..6dec5d63455e 100644 --- a/cpp/src/arrow/compute/row/grouper_benchmark.cc +++ b/cpp/src/arrow/compute/row/grouper_benchmark.cc @@ -64,11 +64,8 @@ static ExecBatch MakeRandomExecBatch(const DataTypeVector& types, int64_t num_ro std::vector values; values.resize(num_types); for (int i = 0; i < num_types; ++i) { - // The random generator honors the "unique" knob (which controls group - // cardinality) for plain int32-offset binary/string but not for view types. - // To keep view-key benchmarks comparable to their non-view counterparts, - // generate the plain equivalent column and cast it to the view type; the - // cast happens outside the timed grouping loop. + // The "unique" cardinality knob isn't honored for view types, so generate the + // plain equivalent and cast it (outside the timed loop) to match {utf8}. const auto& type = types[i]; const bool is_view = is_binary_view_like(*type); std::shared_ptr gen_type = type; @@ -142,8 +139,7 @@ BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{boolean}", {boolean()})->Apply(SetArg BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int32}", {int32()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{int64}", {int64()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8}", {utf8()})->Apply(SetArgs); -// View ("German string") keys route through the generic GrouperImpl (the -// Swiss-table fast path used by {utf8} rejects view keys), like large_utf8. +// View keys use the generic GrouperImpl path (the {utf8} fast path rejects them). BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{utf8_view}", {utf8_view()})->Apply(SetArgs); BENCHMARK_CAPTURE(GrouperWithMultiTypes, "{fixed_size_binary(32)}", {fixed_size_binary(32)}) diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc index fd6a2b850a1b..717b15c58f40 100644 --- a/cpp/src/arrow/compute/row/grouper_test.cc +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -757,10 +757,8 @@ struct TestGrouper { auto group_ids = id_batch.make_array(); ValidateOutput(*group_ids); - // View ("German string") types don't have take kernels yet (GH-43010), so - // round-trip the validation through their non-view equivalent. This doesn't - // weaken the check: the grouper's output type is verified directly by - // ExpectUniques, and the encoded key bytes are identical across the layouts. + // Take has no view kernel yet (GH-43010); validate via the non-view cast. + // Output type is checked separately by ExpectUniques. auto as_takeable = [](std::shared_ptr arr) -> std::shared_ptr { if (is_binary_view_like(*arr->type())) { auto target = arr->type_id() == Type::STRING_VIEW ? utf8() : binary(); @@ -930,10 +928,7 @@ TEST(Grouper, StringKey) { } TEST(Grouper, StringViewKey) { - // View ("German string") keys route through GrouperImpl's BinaryViewKeyEncoder - // (GrouperFastImpl rejects them). Mix short keys (stored inline in the view - // header) with keys longer than 12 bytes (stored out-of-line) to exercise both - // view representations. + // Mix inline (<=12 byte) and out-of-line view keys. for (auto ty : {utf8_view(), binary_view()}) { ARROW_SCOPED_TRACE("key type = ", *ty); { @@ -943,8 +938,6 @@ TEST(Grouper, StringViewKey) { g.ExpectConsume(R"([["be"], [null]])", "[1, 2]"); g.ExpectConsume(R"([["a long out-of-line view"], ["a long out-of-line view"]])", "[3, 3]"); - // GetUniques must round-trip back to the original view type, preserving the - // inline/out-of-line distinction transparently. g.ExpectUniques(R"([["eh"], ["be"], [null], ["a long out-of-line view"]])"); } { @@ -1132,9 +1125,7 @@ FieldVector AnnotateForRandomGeneration(FieldVector fields) { // (note this is unsupported for large binary types) field = field->WithMergedMetadata(key_value_metadata({"unique"}, {"100"})); } else if (is_binary_view_like(*field->type())) { - // View types don't support the "unique" knob; constrain the string length - // instead so that group ids repeat (and a mix of inline/out-of-line views - // is exercised). + // Views don't support the "unique" knob; small lengths keep cardinality low. field = field->WithMergedMetadata( key_value_metadata({"min_length", "max_length"}, {"0", "5"})); } diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.cc b/cpp/src/arrow/compute/row/row_encoder_internal.cc index 521f6692d3fc..2926bbfb1b7e 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.cc +++ b/cpp/src/arrow/compute/row/row_encoder_internal.cc @@ -331,10 +331,8 @@ void BinaryViewKeyEncoder::EncodeNull(uint8_t** encoded_bytes) { Result> BinaryViewKeyEncoder::Decode(uint8_t** encoded_bytes, int32_t length, MemoryPool* pool) { - // Build a fresh view array. MakeBuilder yields the type-faithful builder - // (BinaryViewBuilder for binary_view, StringViewBuilder for utf8_view), and - // Append decides inline vs. out-of-line storage, so we never share buffers - // with the encoded input. + // Build a fresh view array; MakeBuilder gives the type-faithful builder and + // Append handles inline vs. out-of-line storage. std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, type_, &builder)); auto& view_builder = checked_cast(*builder); @@ -350,8 +348,7 @@ Result> BinaryViewKeyEncoder::Decode(uint8_t** encode } else { RETURN_NOT_OK(view_builder.AppendNull()); } - // Null rows encode a zero length, so this is a no-op for them. - encoded_ptr += key_length; + encoded_ptr += key_length; // zero for null rows } std::shared_ptr out; diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.h b/cpp/src/arrow/compute/row/row_encoder_internal.h index 5fd2565873d9..894e157219c0 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.h +++ b/cpp/src/arrow/compute/row/row_encoder_internal.h @@ -252,14 +252,11 @@ struct VarLengthKeyEncoder : KeyEncoder { std::shared_ptr type_; }; -// Encodes BinaryView/StringView ("German string") keys using the same -// variable-length row format as VarLengthKeyEncoder: a leading null byte, a -// 4-byte length prefix, then the raw key bytes. The encoded row copies the key -// bytes (it never references the input's variadic data buffers), so Decode -// builds a fresh view array rather than aliasing shared buffers. +// Encodes BinaryView/StringView keys in the same variable-length row format as +// VarLengthKeyEncoder. The encoded row copies the key bytes, so Decode builds a +// fresh view array rather than aliasing the input's variadic buffers. struct ARROW_COMPUTE_EXPORT BinaryViewKeyEncoder : KeyEncoder { - // The on-row length prefix. Group keys are bounded by the int32 row format, so - // a 32-bit length matches the binary (int32-offset) encoder's wire format. + // On-row length prefix; matches the binary (int32-offset) encoder's format. using Offset = int32_t; explicit BinaryViewKeyEncoder(std::shared_ptr type) diff --git a/cpp/src/arrow/compute/row/row_encoder_internal_test.cc b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc index c2a68421e171..22a72117e9ff 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal_test.cc +++ b/cpp/src/arrow/compute/row/row_encoder_internal_test.cc @@ -67,10 +67,8 @@ TEST(TestKeyEncoder, BooleanScalar) { } } -// Encodes `value` (either an array, or a scalar repeated `length` times) through -// the key encoder and decodes it back, laying out one contiguous row buffer per -// value the way RowEncoder does. Encode and Decode advance the per-row pointers, -// so they are reset between the two calls. +// Encodes `value` (an array, or a scalar repeated `length` times) and decodes it +// back, laying out per-row buffers the way RowEncoder does. Result> RoundTripThroughKeyEncoder(KeyEncoder* encoder, const ExecValue& value, int32_t length) { @@ -95,10 +93,7 @@ Result> RoundTripThroughKeyEncoder(KeyEncoder* encode return encoder->Decode(payload_ptrs.data(), length, ::arrow::default_memory_pool()); } -// GH-43733 follow-up: the BinaryViewKeyEncoder must round-trip view ("German -// string") keys through the variable-length row format and rebuild a view array -// of the original type. Covers short (inline) and >12-byte (out-of-line) views, -// empties, and nulls, for both the array and scalar input paths. +// Round-trip view keys as an array: inline, out-of-line, empty, and null values. TEST(TestKeyEncoder, BinaryViewArray) { for (const auto& ty : {utf8_view(), binary_view()}) { SCOPED_TRACE("type " + ty->ToString()); @@ -120,7 +115,7 @@ TEST(TestKeyEncoder, BinaryViewScalar) { constexpr int32_t kBatchLength = 8; for (const auto& ty : {utf8_view(), binary_view()}) { SCOPED_TRACE("type " + ty->ToString()); - // A scalar key is encoded once per row; cover inline, out-of-line, and null. + // Scalar input path: inline, out-of-line, and null. for (const auto& scalar : {ScalarFromJSON(ty, R"("short")"), ScalarFromJSON(ty, R"("a long out-of-line value")"), MakeNullScalar(ty)}) {