Skip to content

Commit 81270f1

Browse files
scovichcarpecodeum
authored andcommitted
[Variant] Very rough pathfinding for variant get/shredding
1 parent 76b75ee commit 81270f1

File tree

7 files changed

+750
-60
lines changed

7 files changed

+750
-60
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 185 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
2121
use arrow::buffer::NullBuffer;
2222
use arrow::datatypes::Int32Type;
23-
use arrow_schema::{ArrowError, DataType};
23+
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
2424
use parquet_variant::Variant;
2525
use std::any::Any;
2626
use std::sync::Arc;
@@ -48,6 +48,9 @@ pub struct VariantArray {
4848
/// Reference to the underlying StructArray
4949
inner: StructArray,
5050

51+
/// The metadata column of this variant
52+
metadata: BinaryViewArray,
53+
5154
/// how is this variant array shredded?
5255
shredding_state: ShreddingState,
5356
}
@@ -102,31 +105,42 @@ impl VariantArray {
102105
)));
103106
};
104107

105-
// Find the value field, if present
106-
let value = inner
107-
.column_by_name("value")
108-
.map(|v| {
109-
v.as_binary_view_opt().ok_or_else(|| {
110-
ArrowError::NotYetImplemented(format!(
111-
"VariantArray 'value' field must be BinaryView, got {}",
112-
v.data_type()
113-
))
114-
})
115-
})
116-
.transpose()?;
117-
118-
// Find the typed_value field, if present
119-
let typed_value = inner.column_by_name("typed_value");
120-
121108
// Note these clones are cheap, they just bump the ref count
122-
let inner = inner.clone();
123-
let shredding_state =
124-
ShreddingState::try_new(metadata.clone(), value.cloned(), typed_value.cloned())?;
125-
126109
Ok(Self {
110+
inner: inner.clone(),
111+
metadata: metadata.clone(),
112+
shredding_state: ShreddingState::try_new(inner)?,
113+
})
114+
}
115+
116+
#[allow(unused)]
117+
pub(crate) fn from_parts(
118+
metadata: BinaryViewArray,
119+
value: Option<BinaryViewArray>,
120+
typed_value: Option<ArrayRef>,
121+
nulls: Option<NullBuffer>,
122+
) -> Self {
123+
let mut builder =
124+
StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()));
125+
if let Some(value) = value.clone() {
126+
builder = builder.with_field("value", Arc::new(value));
127+
}
128+
if let Some(typed_value) = typed_value.clone() {
129+
builder = builder.with_field("typed_value", typed_value);
130+
}
131+
if let Some(nulls) = nulls {
132+
builder = builder.with_nulls(nulls);
133+
}
134+
135+
// This would be a lot simpler if ShreddingState were just a pair of Option... we already
136+
// have everything we need.
137+
let inner = builder.build();
138+
let shredding_state = ShreddingState::try_new(&inner).unwrap(); // valid by construction
139+
Self {
127140
inner,
141+
metadata,
128142
shredding_state,
129-
})
143+
}
130144
}
131145

132146
/// Returns a reference to the underlying [`StructArray`].
@@ -166,23 +180,19 @@ impl VariantArray {
166180
/// caller to ensure that the metadata and value were constructed correctly.
167181
pub fn value(&self, index: usize) -> Variant<'_, '_> {
168182
match &self.shredding_state {
169-
ShreddingState::Unshredded { metadata, value } => {
170-
Variant::new(metadata.value(index), value.value(index))
183+
ShreddingState::Unshredded { value } => {
184+
Variant::new(self.metadata.value(index), value.value(index))
171185
}
172-
ShreddingState::Typed { typed_value, .. } => {
186+
ShreddingState::PerfectlyShredded { typed_value, .. } => {
173187
if typed_value.is_null(index) {
174188
Variant::Null
175189
} else {
176190
typed_value_to_variant(typed_value, index)
177191
}
178192
}
179-
ShreddingState::PartiallyShredded {
180-
metadata,
181-
value,
182-
typed_value,
183-
} => {
193+
ShreddingState::ImperfectlyShredded { value, typed_value } => {
184194
if typed_value.is_null(index) {
185-
Variant::new(metadata.value(index), value.value(index))
195+
Variant::new(self.metadata.value(index), value.value(index))
186196
} else {
187197
typed_value_to_variant(typed_value, index)
188198
}
@@ -199,7 +209,96 @@ impl VariantArray {
199209

200210
/// Return a reference to the metadata field of the [`StructArray`]
201211
pub fn metadata_field(&self) -> &BinaryViewArray {
202-
self.shredding_state.metadata_field()
212+
&self.metadata
213+
}
214+
215+
/// Return a reference to the value field of the `StructArray`
216+
pub fn value_field(&self) -> Option<&BinaryViewArray> {
217+
self.shredding_state.value_field()
218+
}
219+
220+
/// Return a reference to the typed_value field of the `StructArray`, if present
221+
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
222+
self.shredding_state.typed_value_field()
223+
}
224+
}
225+
226+
/// One shredded field of a partially or prefectly shredded variant. For example, suppose the
227+
/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
228+
/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
229+
/// is:
230+
///
231+
/// ```text
232+
/// v: VARIANT {
233+
/// metadata: BINARY,
234+
/// value: BINARY,
235+
/// typed_value: STRUCT {
236+
/// a: SHREDDED_VARIANT_FIELD {
237+
/// value: BINARY,
238+
/// typed_value: STRUCT {
239+
/// a: SHREDDED_VARIANT_FIELD {
240+
/// value: BINARY,
241+
/// typed_value: INT,
242+
/// },
243+
/// },
244+
/// },
245+
/// },
246+
/// }
247+
/// ```
248+
///
249+
/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
250+
/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
251+
/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
252+
/// single expected field `a`).
253+
///
254+
/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
255+
/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
256+
/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
257+
///
258+
/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
259+
/// variant value.
260+
pub struct ShreddedVariantFieldArray {
261+
shredding_state: ShreddingState,
262+
}
263+
264+
#[allow(unused)]
265+
impl ShreddedVariantFieldArray {
266+
/// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
267+
///
268+
/// # Arguments
269+
/// - `inner` - The underlying [`StructArray`] that contains the variant data.
270+
///
271+
/// # Returns
272+
/// - A new instance of `ShreddedVariantFieldArray`.
273+
///
274+
/// # Errors:
275+
/// - If the `StructArray` does not contain the required fields
276+
///
277+
/// # Requirements of the `StructArray`
278+
///
279+
/// 1. An optional field named `value` that is binary, large_binary, or
280+
/// binary_view
281+
///
282+
/// 2. An optional field named `typed_value` which can be any primitive type
283+
/// or be a list, large_list, list_view or struct
284+
///
285+
/// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
286+
pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
287+
let Some(inner) = inner.as_struct_opt() else {
288+
return Err(ArrowError::InvalidArgumentError(
289+
"Invalid VariantArray: requires StructArray as input".to_string(),
290+
));
291+
};
292+
293+
// Note this clone is cheap, it just bumps the ref count
294+
Ok(Self {
295+
shredding_state: ShreddingState::try_new(inner)?,
296+
})
297+
}
298+
299+
/// Return the shredding state of this `VariantArray`
300+
pub fn shredding_state(&self) -> &ShreddingState {
301+
&self.shredding_state
203302
}
204303

205304
/// Return a reference to the value field of the `StructArray`
@@ -234,24 +333,21 @@ impl VariantArray {
234333
#[derive(Debug)]
235334
pub enum ShreddingState {
236335
/// This variant has no typed_value field
237-
Unshredded {
238-
metadata: BinaryViewArray,
239-
value: BinaryViewArray,
240-
},
336+
Unshredded { value: BinaryViewArray },
241337
/// This variant has a typed_value field and no value field
242338
/// meaning it is the shredded type
243-
Typed {
244-
metadata: BinaryViewArray,
245-
typed_value: ArrayRef,
246-
},
247-
/// Partially shredded:
248-
/// * value is an object
249-
/// * typed_value is a shredded object.
339+
PerfectlyShredded { typed_value: ArrayRef },
340+
/// Imperfectly shredded: Shredded values reside in `typed_value` while those that failed to
341+
/// shred reside in `value`. Missing field values are NULL in both columns, while NULL primitive
342+
/// values have NULL `typed_value` and `Variant::Null` in `value`.
250343
///
251-
/// Note the spec says "Writers must not produce data where both value and
252-
/// typed_value are non-null, unless the Variant value is an object."
253-
PartiallyShredded {
254-
metadata: BinaryViewArray,
344+
/// NOTE: A partially shredded struct is a special kind of imperfect shredding, where
345+
/// `typed_value` and `value` are both non-NULL. The `typed_value` is a struct containing the
346+
/// subset of fields for which shredding was attempted (each field will then have its own value
347+
/// and/or typed_value sub-fields that indicate how shredding actually turned out). Meanwhile,
348+
/// the `value` is a variant object containing the subset of fields for which shredding was
349+
/// not even attempted.
350+
ImperfectlyShredded {
255351
value: BinaryViewArray,
256352
typed_value: ArrayRef,
257353
},
@@ -319,8 +415,7 @@ impl ShreddingState {
319415
/// Slice all the underlying arrays
320416
pub fn slice(&self, offset: usize, length: usize) -> Self {
321417
match self {
322-
ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
323-
metadata: metadata.slice(offset, length),
418+
ShreddingState::Unshredded { value } => ShreddingState::Unshredded {
324419
value: value.slice(offset, length),
325420
},
326421
ShreddingState::Typed {
@@ -346,6 +441,45 @@ impl ShreddingState {
346441
}
347442
}
348443

444+
/// Builds struct arrays from component fields
445+
///
446+
/// TODO: move to arrow crate
447+
#[derive(Debug, Default, Clone)]
448+
pub struct StructArrayBuilder {
449+
fields: Vec<FieldRef>,
450+
arrays: Vec<ArrayRef>,
451+
nulls: Option<NullBuffer>,
452+
}
453+
454+
impl StructArrayBuilder {
455+
pub fn new() -> Self {
456+
Default::default()
457+
}
458+
459+
/// Add an array to this struct array as a field with the specified name.
460+
pub fn with_field(mut self, field_name: &str, array: ArrayRef) -> Self {
461+
let field = Field::new(field_name, array.data_type().clone(), true);
462+
self.fields.push(Arc::new(field));
463+
self.arrays.push(array);
464+
self
465+
}
466+
467+
/// Set the null buffer for this struct array.
468+
pub fn with_nulls(mut self, nulls: NullBuffer) -> Self {
469+
self.nulls = Some(nulls);
470+
self
471+
}
472+
473+
pub fn build(self) -> StructArray {
474+
let Self {
475+
fields,
476+
arrays,
477+
nulls,
478+
} = self;
479+
StructArray::new(Fields::from(fields), arrays, nulls)
480+
}
481+
}
482+
349483
/// returns the non-null element at index as a Variant
350484
fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> {
351485
match typed_value.data_type() {
@@ -388,9 +522,11 @@ impl Array for VariantArray {
388522

389523
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
390524
let inner = self.inner.slice(offset, length);
525+
let metadata = self.metadata.slice(offset, length);
391526
let shredding_state = self.shredding_state.slice(offset, length);
392527
Arc::new(Self {
393528
inner,
529+
metadata,
394530
shredding_state,
395531
})
396532
}

0 commit comments

Comments
 (0)