Skip to content

Commit 6be6cba

Browse files
sdf-jklklion26scovichalamb
authored
Support variant_to_arrow for utf8 (#8600)
# Which issue does this PR close? - Closes #8567. # Rationale for this change Add support for Variant::Utf8, LargeUtf8, Utf8View. This needs to add a new builder, VariantToStringArrowBuilder, because LargeUtf8 and Utf8View are not `ArrowPrimitiveType`s. # What changes are included in this PR? - Added support for Variant::Utf8, LargeUtf8, Utf8View by adding a new enum and builder for utf8 and largeUtf8 and added utf8view to primitive builder. - Added a new variable `data_capacity` to `make_string_variant_to_arrow_row_builder` to support string types. - Updated the `make_string_variant_to_arrow_row_builder` in `variant_get` to include the variable. # Are these changes tested? Added a variant_get test for utf8 type and created two separate tests for largeUtf8 and Utf8view because these types can't be shredded. # Are there any user-facing changes? No --------- Co-authored-by: Congxian Qiu <[email protected]> Co-authored-by: Ryan Johnson <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent d48bf0d commit 6be6cba

File tree

5 files changed

+134
-7
lines changed

5 files changed

+134
-7
lines changed

arrow-array/src/builder/generic_bytes_builder.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,50 @@ impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
348348
}
349349
}
350350

351+
/// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`]
352+
///
353+
/// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \
354+
/// - `item_capacity` - the row count \
355+
/// - `data_capacity` - total string byte count \
356+
///
357+
/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \
358+
///
359+
/// These capacities are preallocation hints used to improve performance,
360+
/// but the consequences of passing a hint too large or too small should be negligible.
361+
const AVERAGE_STRING_LENGTH: usize = 16;
362+
/// Trait for string-like array builders
363+
///
364+
/// This trait provides a unified interface for builders that append string-like data
365+
/// such as [`GenericStringBuilder<O>`] and [`crate::builder::StringViewBuilder`]
366+
pub trait StringLikeArrayBuilder: ArrayBuilder {
367+
/// Returns a human-readable type name for the builder.
368+
fn type_name() -> &'static str;
369+
370+
/// Creates a new builder with the given row capacity.
371+
fn with_capacity(capacity: usize) -> Self;
372+
373+
/// Appends a non-null string value to the builder.
374+
fn append_value(&mut self, value: &str);
375+
376+
/// Appends a null value to the builder.
377+
fn append_null(&mut self);
378+
}
379+
380+
impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> {
381+
fn type_name() -> &'static str {
382+
std::any::type_name::<Self>()
383+
}
384+
fn with_capacity(capacity: usize) -> Self {
385+
Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH)
386+
}
387+
fn append_value(&mut self, value: &str) {
388+
Self::append_value(self, value);
389+
}
390+
fn append_null(&mut self) {
391+
Self::append_null(self);
392+
}
393+
}
394+
351395
/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
352396
///
353397
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow_schema::ArrowError;
2525
use hashbrown::HashTable;
2626
use hashbrown::hash_table::Entry;
2727

28-
use crate::builder::ArrayBuilder;
28+
use crate::builder::{ArrayBuilder, StringLikeArrayBuilder};
2929
use crate::types::bytes::ByteArrayNativeType;
3030
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
3131
use crate::{Array, ArrayRef, GenericByteViewArray};
@@ -533,6 +533,21 @@ impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
533533
/// ```
534534
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
535535

536+
impl StringLikeArrayBuilder for StringViewBuilder {
537+
fn type_name() -> &'static str {
538+
std::any::type_name::<StringViewBuilder>()
539+
}
540+
fn with_capacity(capacity: usize) -> Self {
541+
Self::with_capacity(capacity)
542+
}
543+
fn append_value(&mut self, value: &str) {
544+
Self::append_value(self, value);
545+
}
546+
fn append_null(&mut self) {
547+
Self::append_null(self);
548+
}
549+
}
550+
536551
/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
537552
///
538553
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with

parquet-variant-compute/src/variant_array.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,16 @@ fn typed_value_to_variant<'a>(
967967
let value = array.value(index);
968968
Ok(Variant::from(value))
969969
}
970+
DataType::LargeUtf8 => {
971+
let array = typed_value.as_string::<i64>();
972+
let value = array.value(index);
973+
Ok(Variant::from(value))
974+
}
975+
DataType::Utf8View => {
976+
let array = typed_value.as_string_view();
977+
let value = array.value(index);
978+
Ok(Variant::from(value))
979+
}
970980
DataType::Int8 => {
971981
primitive_conversion_single_value!(Int8Type, typed_value, index)
972982
}
@@ -1165,14 +1175,14 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, Dat
11651175
// Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
11661176
// reader returns and what the rest of the variant code expects.
11671177
Binary => Cow::Owned(DataType::BinaryView),
1168-
BinaryView | Utf8 => borrow!(),
1178+
BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
11691179

11701180
// UUID maps to 16-byte fixed-size binary; no other width is allowed
11711181
FixedSizeBinary(16) => borrow!(),
11721182
FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
11731183

11741184
// We can _possibly_ allow (some of) these some day?
1175-
LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
1185+
LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => {
11761186
fail!()
11771187
}
11781188

parquet-variant-compute/src/variant_get.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,8 @@ mod test {
311311
use arrow::array::{
312312
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array,
313313
Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array,
314-
Int16Array, Int32Array, Int64Array, NullBuilder, StringArray, StructArray,
315-
Time64MicrosecondArray,
314+
Int16Array, Int32Array, Int64Array, LargeStringArray, NullBuilder, StringArray,
315+
StringViewArray, StructArray, Time64MicrosecondArray,
316316
};
317317
use arrow::buffer::NullBuffer;
318318
use arrow::compute::CastOptions;
@@ -778,6 +778,27 @@ mod test {
778778
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
779779
);
780780

781+
perfectly_shredded_to_arrow_primitive_test!(
782+
get_variant_perfectly_shredded_utf8_as_utf8,
783+
DataType::Utf8,
784+
perfectly_shredded_utf8_variant_array,
785+
StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
786+
);
787+
788+
perfectly_shredded_to_arrow_primitive_test!(
789+
get_variant_perfectly_shredded_large_utf8_as_utf8,
790+
DataType::Utf8,
791+
perfectly_shredded_large_utf8_variant_array,
792+
StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
793+
);
794+
795+
perfectly_shredded_to_arrow_primitive_test!(
796+
get_variant_perfectly_shredded_utf8_view_as_utf8,
797+
DataType::Utf8,
798+
perfectly_shredded_utf8_view_variant_array,
799+
StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
800+
);
801+
781802
macro_rules! perfectly_shredded_variant_array_fn {
782803
($func:ident, $typed_value_gen:expr) => {
783804
fn $func() -> ArrayRef {
@@ -801,6 +822,18 @@ mod test {
801822
};
802823
}
803824

825+
perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_variant_array, || {
826+
StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
827+
});
828+
829+
perfectly_shredded_variant_array_fn!(perfectly_shredded_large_utf8_variant_array, || {
830+
LargeStringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
831+
});
832+
833+
perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_view_variant_array, || {
834+
StringViewArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
835+
});
836+
804837
perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || {
805838
BooleanArray::from(vec![Some(true), Some(false), Some(true)])
806839
});

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
// under the License.
1717

1818
use arrow::array::{
19-
ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, NullArray,
20-
NullBufferBuilder, PrimitiveBuilder,
19+
ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder,
20+
NullArray, NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
21+
StringViewBuilder,
2122
};
2223
use arrow::compute::{CastOptions, DecimalCast};
2324
use arrow::datatypes::{self, DataType, DecimalType};
@@ -62,6 +63,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
6263
Time(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64MicrosecondType>),
6364
Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
6465
Uuid(VariantToUuidArrowRowBuilder<'a>),
66+
String(VariantToStringArrowBuilder<'a, StringBuilder>),
67+
LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
68+
StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
6569
}
6670

6771
/// Builder for converting variant values into strongly typed Arrow arrays.
@@ -104,6 +108,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
104108
Time(b) => b.append_null(),
105109
Date(b) => b.append_null(),
106110
Uuid(b) => b.append_null(),
111+
String(b) => b.append_null(),
112+
LargeString(b) => b.append_null(),
113+
StringView(b) => b.append_null(),
107114
}
108115
}
109116

@@ -134,6 +141,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
134141
Time(b) => b.append_value(value),
135142
Date(b) => b.append_value(value),
136143
Uuid(b) => b.append_value(value),
144+
String(b) => b.append_value(value),
145+
LargeString(b) => b.append_value(value),
146+
StringView(b) => b.append_value(value),
137147
}
138148
}
139149

@@ -164,6 +174,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
164174
Time(b) => b.finish(),
165175
Date(b) => b.finish(),
166176
Uuid(b) => b.finish(),
177+
String(b) => b.finish(),
178+
LargeString(b) => b.finish(),
179+
StringView(b) => b.finish(),
167180
}
168181
}
169182
}
@@ -304,6 +317,11 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
304317
"FixedSizeBinary({size}) is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is supported."
305318
)));
306319
}
320+
DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)),
321+
DataType::LargeUtf8 => {
322+
LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
323+
}
324+
DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
307325
_ if data_type.is_primitive() => {
308326
return Err(ArrowError::NotYetImplemented(format!(
309327
"Primitive data_type {data_type:?} not yet implemented"
@@ -451,6 +469,13 @@ macro_rules! define_variant_to_primitive_builder {
451469
}
452470
}
453471

472+
define_variant_to_primitive_builder!(
473+
struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder>
474+
|capacity| -> B { B::with_capacity(capacity) },
475+
|value| value.as_string(),
476+
type_name: B::type_name()
477+
);
478+
454479
define_variant_to_primitive_builder!(
455480
struct VariantToBooleanArrowRowBuilder<'a>
456481
|capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },

0 commit comments

Comments
 (0)