Skip to content

Commit 62b6299

Browse files
committed
address reviews
1 parent 0662e12 commit 62b6299

File tree

2 files changed

+88
-84
lines changed

2 files changed

+88
-84
lines changed

parquet-variant-compute/src/unshred_variant.rs

Lines changed: 85 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
2020
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
2121
use arrow::array::{
22-
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, NullBufferBuilder,
23-
PrimitiveArray, StringArray, StructArray,
22+
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,
23+
StringArray, StructArray,
2424
};
2525
use arrow::buffer::NullBuffer;
2626
use arrow::datatypes::{
@@ -58,21 +58,19 @@ pub fn unshred_variant(array: &VariantArray) -> Result<VariantArray> {
5858

5959
// NOTE: None/None at top-level is technically invalid, but the shredding spec requires us to
6060
// emit `Variant::Null` when a required value is missing.
61-
let mut row_builder = make_unshred_variant_row_builder(array.shredding_state().borrow())?
62-
.unwrap_or_else(|| UnshredVariantRowBuilder::null(array.nulls()));
61+
let nulls = array.nulls();
62+
let mut row_builder = UnshredVariantRowBuilder::try_new_opt(array.shredding_state().borrow())?
63+
.unwrap_or_else(|| UnshredVariantRowBuilder::null(nulls));
6364

6465
let metadata = array.metadata_field();
6566
let mut value_builder = VariantValueArrayBuilder::new(array.len());
66-
let mut null_builder = NullBufferBuilder::new(array.len());
6767
for i in 0..array.len() {
6868
if array.is_null(i) {
6969
value_builder.append_null();
70-
null_builder.append_null();
7170
} else {
7271
let metadata = VariantMetadata::new(metadata.value(i));
7372
let mut value_builder = value_builder.builder_ext(&metadata);
7473
row_builder.append_row(&mut value_builder, &metadata, i)?;
75-
null_builder.append_non_null();
7674
}
7775
}
7876

@@ -81,7 +79,7 @@ pub fn unshred_variant(array: &VariantArray) -> Result<VariantArray> {
8179
metadata.clone(),
8280
Some(value),
8381
None,
84-
null_builder.finish(),
82+
nulls.cloned(),
8583
))
8684
}
8785

@@ -139,86 +137,86 @@ impl<'a> UnshredVariantRowBuilder<'a> {
139137
Self::Null(b) => b.append_row(builder, metadata, index),
140138
}
141139
}
142-
}
143-
144-
/// Factory function to create the appropriate row builder for given field components
145-
/// Returns None for None/None case - caller decides how to handle based on context
146-
fn make_unshred_variant_row_builder<'a>(
147-
shredding_state: BorrowedShreddingState<'a>,
148-
) -> Result<Option<UnshredVariantRowBuilder<'a>>> {
149-
let value = shredding_state.value_field();
150-
let typed_value = shredding_state.typed_value_field();
151-
let Some(typed_value) = typed_value else {
152-
// Copy the value across directly, if present. Else caller decides what to do.
153-
return Ok(value
154-
.map(|v| UnshredVariantRowBuilder::ValueOnly(ValueOnlyUnshredVariantBuilder::new(v))));
155-
};
156140

157-
// Has typed_value -> determine type and create appropriate builder
158-
macro_rules! primitive_builder {
159-
($enum_variant:ident, $cast_fn:ident) => {
160-
UnshredVariantRowBuilder::$enum_variant(UnshredPrimitiveRowBuilder::new(
161-
value,
162-
typed_value.$cast_fn(),
163-
))
141+
/// Creates a new UnshredVariantRowBuilder from shredding state
142+
/// Returns None for None/None case - caller decides how to handle based on context
143+
fn try_new_opt(shredding_state: BorrowedShreddingState<'a>) -> Result<Option<Self>> {
144+
let value = shredding_state.value_field();
145+
let typed_value = shredding_state.typed_value_field();
146+
let Some(typed_value) = typed_value else {
147+
// Copy the value across directly, if present. Else caller decides what to do.
148+
return Ok(value.map(|v| Self::ValueOnly(ValueOnlyUnshredVariantBuilder::new(v))));
164149
};
165-
}
166150

167-
let builder = match typed_value.data_type() {
168-
DataType::Int8 => primitive_builder!(PrimitiveInt8, as_primitive),
169-
DataType::Int16 => primitive_builder!(PrimitiveInt16, as_primitive),
170-
DataType::Int32 => primitive_builder!(PrimitiveInt32, as_primitive),
171-
DataType::Int64 => primitive_builder!(PrimitiveInt64, as_primitive),
172-
DataType::Float32 => primitive_builder!(PrimitiveFloat32, as_primitive),
173-
DataType::Float64 => primitive_builder!(PrimitiveFloat64, as_primitive),
174-
DataType::Date32 => primitive_builder!(PrimitiveDate32, as_primitive),
175-
DataType::Time64(TimeUnit::Microsecond) => {
176-
primitive_builder!(PrimitiveTime64, as_primitive)
177-
}
178-
DataType::Time64(time_unit) => {
179-
return Err(ArrowError::InvalidArgumentError(format!(
180-
"Time64({time_unit}) is not a valid variant shredding type",
181-
)));
182-
}
183-
DataType::Timestamp(TimeUnit::Microsecond, timezone) => {
184-
UnshredVariantRowBuilder::TimestampMicrosecond(TimestampUnshredRowBuilder::new(
185-
value,
186-
typed_value.as_primitive(),
187-
timezone.is_some(),
188-
))
151+
// Has typed_value -> determine type and create appropriate builder
152+
macro_rules! primitive_builder {
153+
($enum_variant:ident, $cast_fn:ident) => {
154+
Self::$enum_variant(UnshredPrimitiveRowBuilder::new(
155+
value,
156+
typed_value.$cast_fn(),
157+
))
158+
};
189159
}
190-
DataType::Timestamp(TimeUnit::Nanosecond, timezone) => {
191-
UnshredVariantRowBuilder::TimestampNanosecond(TimestampUnshredRowBuilder::new(
160+
161+
let builder = match typed_value.data_type() {
162+
DataType::Int8 => primitive_builder!(PrimitiveInt8, as_primitive),
163+
DataType::Int16 => primitive_builder!(PrimitiveInt16, as_primitive),
164+
DataType::Int32 => primitive_builder!(PrimitiveInt32, as_primitive),
165+
DataType::Int64 => primitive_builder!(PrimitiveInt64, as_primitive),
166+
DataType::Float32 => primitive_builder!(PrimitiveFloat32, as_primitive),
167+
DataType::Float64 => primitive_builder!(PrimitiveFloat64, as_primitive),
168+
DataType::Date32 => primitive_builder!(PrimitiveDate32, as_primitive),
169+
DataType::Time64(TimeUnit::Microsecond) => {
170+
primitive_builder!(PrimitiveTime64, as_primitive)
171+
}
172+
DataType::Time64(time_unit) => {
173+
return Err(ArrowError::InvalidArgumentError(format!(
174+
"Time64({time_unit}) is not a valid variant shredding type",
175+
)));
176+
}
177+
DataType::Timestamp(TimeUnit::Microsecond, timezone) => {
178+
Self::TimestampMicrosecond(TimestampUnshredRowBuilder::new(
179+
value,
180+
typed_value.as_primitive(),
181+
timezone.is_some(),
182+
))
183+
}
184+
DataType::Timestamp(TimeUnit::Nanosecond, timezone) => {
185+
Self::TimestampNanosecond(TimestampUnshredRowBuilder::new(
186+
value,
187+
typed_value.as_primitive(),
188+
timezone.is_some(),
189+
))
190+
}
191+
DataType::Timestamp(time_unit, _) => {
192+
return Err(ArrowError::InvalidArgumentError(format!(
193+
"Timestamp({time_unit}) is not a valid variant shredding type",
194+
)));
195+
}
196+
DataType::Boolean => primitive_builder!(PrimitiveBoolean, as_boolean),
197+
DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
198+
DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, as_binary_view),
199+
DataType::FixedSizeBinary(16) => {
200+
primitive_builder!(PrimitiveUuid, as_fixed_size_binary)
201+
}
202+
DataType::FixedSizeBinary(size) => {
203+
return Err(ArrowError::InvalidArgumentError(format!(
204+
"FixedSizeBinary({size}) is not a valid variant shredding type",
205+
)));
206+
}
207+
DataType::Struct(_) => Self::Struct(StructUnshredVariantBuilder::try_new(
192208
value,
193-
typed_value.as_primitive(),
194-
timezone.is_some(),
195-
))
196-
}
197-
DataType::Timestamp(time_unit, _) => {
198-
return Err(ArrowError::InvalidArgumentError(format!(
199-
"Timestamp({time_unit}) is not a valid variant shredding type",
200-
)));
201-
}
202-
DataType::Boolean => primitive_builder!(PrimitiveBoolean, as_boolean),
203-
DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
204-
DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, as_binary_view),
205-
DataType::FixedSizeBinary(16) => primitive_builder!(PrimitiveUuid, as_fixed_size_binary),
206-
DataType::FixedSizeBinary(size) => {
207-
return Err(ArrowError::InvalidArgumentError(format!(
208-
"FixedSizeBinary({size}) is not a valid variant shredding type",
209-
)));
210-
}
211-
DataType::Struct(_) => UnshredVariantRowBuilder::Struct(
212-
StructUnshredVariantBuilder::try_new(value, typed_value.as_struct())?,
213-
),
214-
_ => {
215-
return Err(ArrowError::NotYetImplemented(format!(
216-
"Unshredding not yet supported for type: {}",
217-
typed_value.data_type()
218-
)));
219-
}
220-
};
221-
Ok(Some(builder))
209+
typed_value.as_struct(),
210+
)?),
211+
_ => {
212+
return Err(ArrowError::NotYetImplemented(format!(
213+
"Unshredding not yet supported for type: {}",
214+
typed_value.data_type()
215+
)));
216+
}
217+
};
218+
Ok(Some(builder))
219+
}
222220
}
223221

224222
/// Builder for arrays with neither typed_value nor value (all NULL/Variant::Null)
@@ -466,7 +464,7 @@ impl<'a> StructUnshredVariantBuilder<'a> {
466464
field_array.data_type()
467465
)));
468466
};
469-
let field_unshredder = make_unshred_variant_row_builder(field_array.try_into()?)?;
467+
let field_unshredder = UnshredVariantRowBuilder::try_new_opt(field_array.try_into()?)?;
470468
field_unshredders.insert(field.name().as_ref(), field_unshredder);
471469
}
472470

@@ -518,3 +516,6 @@ impl<'a> StructUnshredVariantBuilder<'a> {
518516
Ok(())
519517
}
520518
}
519+
520+
// TODO: This code is covered by tests in `parquet/tests/variant_integration.rs`. Does that suffice?
521+
// Or do we also need targeted stand-alone unit tests for full coverage?

parquet/tests/variant_integration.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,9 @@ impl VariantTestCase {
291291

292292
let variant_data = self.load_variants();
293293
let variant_array = self.load_parquet();
294+
295+
// `load_parquet` returns shredded variant values, but the test expectations are provided as
296+
// unshredded variant values. Unshred (failing for invalid input) so we can compare them.
294297
let variant_array = unshred_variant(&variant_array).unwrap();
295298

296299
// if this is an error case, the expected error message should be set

0 commit comments

Comments
 (0)