Skip to content

Commit 2326b55

Browse files
committed
[ADD] add shredding support for variant objects
1 parent 81270f1 commit 2326b55

File tree

6 files changed

+1404
-536
lines changed

6 files changed

+1404
-536
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 116 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -105,11 +105,26 @@ impl VariantArray {
105105
)));
106106
};
107107

108+
// Extract value and typed_value fields
109+
let value = if let Some(value_col) = inner.column_by_name("value") {
110+
if let Some(binary_view) = value_col.as_binary_view_opt() {
111+
Some(binary_view.clone())
112+
} else {
113+
return Err(ArrowError::NotYetImplemented(format!(
114+
"VariantArray 'value' field must be BinaryView, got {}",
115+
value_col.data_type()
116+
)));
117+
}
118+
} else {
119+
None
120+
};
121+
let typed_value = inner.column_by_name("typed_value").cloned();
122+
108123
// Note these clones are cheap, they just bump the ref count
109124
Ok(Self {
110125
inner: inner.clone(),
111126
metadata: metadata.clone(),
112-
shredding_state: ShreddingState::try_new(inner)?,
127+
shredding_state: ShreddingState::try_new(metadata.clone(), value, typed_value)?,
113128
})
114129
}
115130

@@ -135,7 +150,7 @@ impl VariantArray {
135150
// This would be a lot simpler if ShreddingState were just a pair of Option... we already
136151
// have everything we need.
137152
let inner = builder.build();
138-
let shredding_state = ShreddingState::try_new(&inner).unwrap(); // valid by construction
153+
let shredding_state = ShreddingState::try_new(metadata.clone(), value, typed_value).unwrap(); // valid by construction
139154
Self {
140155
inner,
141156
metadata,
@@ -180,24 +195,28 @@ impl VariantArray {
180195
/// caller to ensure that the metadata and value were constructed correctly.
181196
pub fn value(&self, index: usize) -> Variant<'_, '_> {
182197
match &self.shredding_state {
183-
ShreddingState::Unshredded { value } => {
198+
ShreddingState::Unshredded { value, .. } => {
199+
// Unshredded case
184200
Variant::new(self.metadata.value(index), value.value(index))
185201
}
186-
ShreddingState::PerfectlyShredded { typed_value, .. } => {
202+
ShreddingState::Typed { typed_value, .. } => {
203+
// Typed case (formerly PerfectlyShredded)
187204
if typed_value.is_null(index) {
188205
Variant::Null
189206
} else {
190207
typed_value_to_variant(typed_value, index)
191208
}
192209
}
193-
ShreddingState::ImperfectlyShredded { value, typed_value } => {
210+
ShreddingState::PartiallyShredded { value, typed_value, .. } => {
211+
// PartiallyShredded case (formerly ImperfectlyShredded)
194212
if typed_value.is_null(index) {
195213
Variant::new(self.metadata.value(index), value.value(index))
196214
} else {
197215
typed_value_to_variant(typed_value, index)
198216
}
199217
}
200218
ShreddingState::AllNull { .. } => {
219+
// AllNull case
201220
// NOTE: This handles the case where neither value nor typed_value fields exist.
202221
// For top-level variants, this returns Variant::Null (JSON null).
203222
// For shredded object fields, this technically should indicate SQL NULL,
@@ -256,8 +275,11 @@ impl VariantArray {
256275
/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
257276
///
258277
/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
259-
/// variant value.
278+
/// variant value (which could be `Variant::Null`).
279+
#[derive(Debug)]
260280
pub struct ShreddedVariantFieldArray {
281+
/// The underlying [`StructArray`] wrapped by this array
282+
inner: StructArray,
261283
shredding_state: ShreddingState,
262284
}
263285

@@ -284,15 +306,24 @@ impl ShreddedVariantFieldArray {
284306
///
285307
/// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
286308
pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
287-
let Some(inner) = inner.as_struct_opt() else {
309+
let Some(inner_struct) = inner.as_struct_opt() else {
288310
return Err(ArrowError::InvalidArgumentError(
289-
"Invalid VariantArray: requires StructArray as input".to_string(),
311+
"Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
290312
));
291313
};
292314

315+
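// Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray).
// NOTE(review): a `value` column that is not a BinaryView is silently treated as absent here,
// whereas the VariantArray constructor returns an error for the same case -- confirm this is intended.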
316+
let value = inner_struct.column_by_name("value").and_then(|col| col.as_binary_view_opt().cloned());
317+
let typed_value = inner_struct.column_by_name("typed_value").cloned();
318+
319+
// Use an all-null placeholder metadata array for the constructor (ShreddedVariantFieldArray doesn't have metadata)
320+
let dummy_metadata = arrow::array::BinaryViewArray::new_null(inner_struct.len());
321+
293322
// Note this clone is cheap, it just bumps the ref count
323+
let inner = inner_struct.clone();
294324
Ok(Self {
295-
shredding_state: ShreddingState::try_new(inner)?,
325+
inner: inner.clone(),
326+
shredding_state: ShreddingState::try_new(dummy_metadata, value, typed_value)?,
296327
})
297328
}
298329

@@ -310,6 +341,65 @@ impl ShreddedVariantFieldArray {
310341
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
311342
self.shredding_state.typed_value_field()
312343
}
344+
345+
/// Returns a reference to the underlying [`StructArray`].
346+
pub fn inner(&self) -> &StructArray {
347+
&self.inner
348+
}
349+
}
350+
351+
impl Array for ShreddedVariantFieldArray {
352+
fn as_any(&self) -> &dyn Any {
353+
self
354+
}
355+
356+
fn to_data(&self) -> ArrayData {
357+
self.inner.to_data()
358+
}
359+
360+
fn into_data(self) -> ArrayData {
361+
self.inner.into_data()
362+
}
363+
364+
fn data_type(&self) -> &DataType {
365+
self.inner.data_type()
366+
}
367+
368+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
369+
let inner = self.inner.slice(offset, length);
370+
let shredding_state = self.shredding_state.slice(offset, length);
371+
Arc::new(Self {
372+
inner,
373+
shredding_state,
374+
})
375+
}
376+
377+
fn len(&self) -> usize {
378+
self.inner.len()
379+
}
380+
381+
fn is_empty(&self) -> bool {
382+
self.inner.is_empty()
383+
}
384+
385+
fn offset(&self) -> usize {
386+
self.inner.offset()
387+
}
388+
389+
fn nulls(&self) -> Option<&NullBuffer> {
390+
// According to the shredding spec, ShreddedVariantFieldArray should be
391+
// physically non-nullable - SQL NULL is inferred by both value and
392+
// typed_value being physically NULL
393+
None
394+
}
395+
396+
fn get_buffer_memory_size(&self) -> usize {
397+
self.inner.get_buffer_memory_size()
398+
}
399+
400+
fn get_array_memory_size(&self) -> usize {
401+
self.inner.get_array_memory_size()
402+
}
313403
}
314404

315405
/// Represents the shredding state of a [`VariantArray`]
@@ -333,10 +423,16 @@ impl ShreddedVariantFieldArray {
333423
#[derive(Debug)]
334424
pub enum ShreddingState {
335425
/// This variant has no typed_value field
336-
Unshredded { value: BinaryViewArray },
426+
Unshredded {
427+
metadata: BinaryViewArray,
428+
value: BinaryViewArray,
429+
},
337430
/// This variant has a typed_value field and no value field
338431
/// meaning it is the shredded type
339-
PerfectlyShredded { typed_value: ArrayRef },
432+
Typed {
433+
metadata: BinaryViewArray,
434+
typed_value: ArrayRef,
435+
},
340436
/// Partially shredded: Shredded values reside in `typed_value` while those that failed to
341437
/// shred reside in `value`. Missing field values are NULL in both columns, while NULL primitive
342438
/// values have NULL `typed_value` and `Variant::Null` in `value`.
@@ -347,7 +443,8 @@ pub enum ShreddingState {
347443
/// and/or typed_value sub-fields that indicate how shredding actually turned out). Meanwhile,
348444
/// the `value` is a variant object containing the subset of fields for which shredding was
349445
/// not even attempted.
350-
ImperfectlyShredded {
446+
PartiallyShredded {
447+
metadata: BinaryViewArray,
351448
value: BinaryViewArray,
352449
typed_value: ArrayRef,
353450
},
@@ -357,7 +454,9 @@ pub enum ShreddingState {
357454
/// Note: By strict spec interpretation, this should only be valid for shredded object fields,
358455
/// not top-level variants. However, we allow it and treat it as Variant::Null for pragmatic
359456
/// handling of missing data.
360-
AllNull { metadata: BinaryViewArray },
457+
AllNull {
458+
metadata: BinaryViewArray,
459+
},
361460
}
362461

363462
impl ShreddingState {
@@ -415,7 +514,8 @@ impl ShreddingState {
415514
/// Slice all the underlying arrays
416515
pub fn slice(&self, offset: usize, length: usize) -> Self {
417516
match self {
418-
ShreddingState::Unshredded { value } => ShreddingState::Unshredded {
517+
ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
518+
metadata: metadata.slice(offset, length),
419519
value: value.slice(offset, length),
420520
},
421521
ShreddingState::Typed {
@@ -445,7 +545,7 @@ impl ShreddingState {
445545
///
446546
/// TODO: move to arrow crate
447547
#[derive(Debug, Default, Clone)]
448-
pub struct StructArrayBuilder {
548+
pub(crate) struct StructArrayBuilder {
449549
fields: Vec<FieldRef>,
450550
arrays: Vec<ArrayRef>,
451551
nulls: Option<NullBuffer>,
@@ -658,6 +758,7 @@ mod test {
658758
let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]);
659759
let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap();
660760

761+
// Verify the shredding state is AllNull
661762
assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));
662763

663764
// Verify metadata is preserved correctly

0 commit comments

Comments
 (0)