2020use arrow:: array:: { Array , ArrayData , ArrayRef , AsArray , BinaryViewArray , StructArray } ;
2121use arrow:: buffer:: NullBuffer ;
2222use arrow:: datatypes:: Int32Type ;
23- use arrow_schema:: { ArrowError , DataType } ;
23+ use arrow_schema:: { ArrowError , DataType , Field , FieldRef , Fields } ;
2424use parquet_variant:: Variant ;
2525use std:: any:: Any ;
2626use std:: sync:: Arc ;
@@ -48,6 +48,9 @@ pub struct VariantArray {
4848 /// Reference to the underlying StructArray
4949 inner : StructArray ,
5050
51+ /// The metadata column of this variant
52+ metadata : BinaryViewArray ,
53+
5154 /// how is this variant array shredded?
5255 shredding_state : ShreddingState ,
5356}
@@ -102,31 +105,42 @@ impl VariantArray {
102105 ) ) ) ;
103106 } ;
104107
105- // Find the value field, if present
106- let value = inner
107- . column_by_name ( "value" )
108- . map ( |v| {
109- v. as_binary_view_opt ( ) . ok_or_else ( || {
110- ArrowError :: NotYetImplemented ( format ! (
111- "VariantArray 'value' field must be BinaryView, got {}" ,
112- v. data_type( )
113- ) )
114- } )
115- } )
116- . transpose ( ) ?;
117-
118- // Find the typed_value field, if present
119- let typed_value = inner. column_by_name ( "typed_value" ) ;
120-
121108 // Note these clones are cheap, they just bump the ref count
122- let inner = inner. clone ( ) ;
123- let shredding_state =
124- ShreddingState :: try_new ( metadata. clone ( ) , value. cloned ( ) , typed_value. cloned ( ) ) ?;
125-
126109 Ok ( Self {
110+ inner : inner. clone ( ) ,
111+ metadata : metadata. clone ( ) ,
112+ shredding_state : ShreddingState :: try_new ( inner) ?,
113+ } )
114+ }
115+
116+ #[ allow( unused) ]
117+ pub ( crate ) fn from_parts (
118+ metadata : BinaryViewArray ,
119+ value : Option < BinaryViewArray > ,
120+ typed_value : Option < ArrayRef > ,
121+ nulls : Option < NullBuffer > ,
122+ ) -> Self {
123+ let mut builder =
124+ StructArrayBuilder :: new ( ) . with_field ( "metadata" , Arc :: new ( metadata. clone ( ) ) ) ;
125+ if let Some ( value) = value. clone ( ) {
126+ builder = builder. with_field ( "value" , Arc :: new ( value) ) ;
127+ }
128+ if let Some ( typed_value) = typed_value. clone ( ) {
129+ builder = builder. with_field ( "typed_value" , typed_value) ;
130+ }
131+ if let Some ( nulls) = nulls {
132+ builder = builder. with_nulls ( nulls) ;
133+ }
134+
135+ // This would be a lot simpler if ShreddingState were just a pair of Option... we already
136+ // have everything we need.
137+ let inner = builder. build ( ) ;
138+ let shredding_state = ShreddingState :: try_new ( & inner) . unwrap ( ) ; // valid by construction
139+ Self {
127140 inner,
141+ metadata,
128142 shredding_state,
129- } )
143+ }
130144 }
131145
132146 /// Returns a reference to the underlying [`StructArray`].
@@ -166,23 +180,19 @@ impl VariantArray {
166180 /// caller to ensure that the metadata and value were constructed correctly.
167181 pub fn value ( & self , index : usize ) -> Variant < ' _ , ' _ > {
168182 match & self . shredding_state {
169- ShreddingState :: Unshredded { metadata , value } => {
170- Variant :: new ( metadata. value ( index) , value. value ( index) )
183+ ShreddingState :: Unshredded { value } => {
184+ Variant :: new ( self . metadata . value ( index) , value. value ( index) )
171185 }
172- ShreddingState :: Typed { typed_value, .. } => {
186+ ShreddingState :: PerfectlyShredded { typed_value, .. } => {
173187 if typed_value. is_null ( index) {
174188 Variant :: Null
175189 } else {
176190 typed_value_to_variant ( typed_value, index)
177191 }
178192 }
179- ShreddingState :: PartiallyShredded {
180- metadata,
181- value,
182- typed_value,
183- } => {
193+ ShreddingState :: ImperfectlyShredded { value, typed_value } => {
184194 if typed_value. is_null ( index) {
185- Variant :: new ( metadata. value ( index) , value. value ( index) )
195+ Variant :: new ( self . metadata . value ( index) , value. value ( index) )
186196 } else {
187197 typed_value_to_variant ( typed_value, index)
188198 }
@@ -199,7 +209,96 @@ impl VariantArray {
199209
200210 /// Return a reference to the metadata field of the [`StructArray`]
201211 pub fn metadata_field ( & self ) -> & BinaryViewArray {
202- self . shredding_state . metadata_field ( )
212+ & self . metadata
213+ }
214+
215+ /// Return a reference to the value field of the `StructArray`
216+ pub fn value_field ( & self ) -> Option < & BinaryViewArray > {
217+ self . shredding_state . value_field ( )
218+ }
219+
220+ /// Return a reference to the typed_value field of the `StructArray`, if present
221+ pub fn typed_value_field ( & self ) -> Option < & ArrayRef > {
222+ self . shredding_state . typed_value_field ( )
223+ }
224+ }
225+
226+ /// One shredded field of a partially or prefectly shredded variant. For example, suppose the
227+ /// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
228+ /// itself a struct with the single field `b` of type INT. Then the physical layout of the column
229+ /// is:
230+ ///
231+ /// ```text
232+ /// v: VARIANT {
233+ /// metadata: BINARY,
234+ /// value: BINARY,
235+ /// typed_value: STRUCT {
236+ /// a: SHREDDED_VARIANT_FIELD {
237+ /// value: BINARY,
238+ /// typed_value: STRUCT {
239+ /// a: SHREDDED_VARIANT_FIELD {
240+ /// value: BINARY,
241+ /// typed_value: INT,
242+ /// },
243+ /// },
244+ /// },
245+ /// },
246+ /// }
247+ /// ```
248+ ///
249+ /// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an
250+ /// object at all) or a variant object (partial shredding, `v` was an object but included unexpected
251+ /// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the
252+ /// single expected field `a`).
253+ ///
254+ /// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed
255+ /// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected
256+ /// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`).
257+ ///
258+ /// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a
259+ /// variant value.
260+ pub struct ShreddedVariantFieldArray {
261+ shredding_state : ShreddingState ,
262+ }
263+
264+ #[ allow( unused) ]
265+ impl ShreddedVariantFieldArray {
266+ /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`].
267+ ///
268+ /// # Arguments
269+ /// - `inner` - The underlying [`StructArray`] that contains the variant data.
270+ ///
271+ /// # Returns
272+ /// - A new instance of `ShreddedVariantFieldArray`.
273+ ///
274+ /// # Errors:
275+ /// - If the `StructArray` does not contain the required fields
276+ ///
277+ /// # Requirements of the `StructArray`
278+ ///
279+ /// 1. An optional field named `value` that is binary, large_binary, or
280+ /// binary_view
281+ ///
282+ /// 2. An optional field named `typed_value` which can be any primitive type
283+ /// or be a list, large_list, list_view or struct
284+ ///
285+ /// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
286+ pub fn try_new ( inner : ArrayRef ) -> Result < Self , ArrowError > {
287+ let Some ( inner) = inner. as_struct_opt ( ) else {
288+ return Err ( ArrowError :: InvalidArgumentError (
289+ "Invalid VariantArray: requires StructArray as input" . to_string ( ) ,
290+ ) ) ;
291+ } ;
292+
293+ // Note this clone is cheap, it just bumps the ref count
294+ Ok ( Self {
295+ shredding_state : ShreddingState :: try_new ( inner) ?,
296+ } )
297+ }
298+
299+ /// Return the shredding state of this `VariantArray`
300+ pub fn shredding_state ( & self ) -> & ShreddingState {
301+ & self . shredding_state
203302 }
204303
205304 /// Return a reference to the value field of the `StructArray`
@@ -234,24 +333,21 @@ impl VariantArray {
234333#[ derive( Debug ) ]
235334pub enum ShreddingState {
236335 /// This variant has no typed_value field
237- Unshredded {
238- metadata : BinaryViewArray ,
239- value : BinaryViewArray ,
240- } ,
336+ Unshredded { value : BinaryViewArray } ,
241337 /// This variant has a typed_value field and no value field
242338 /// meaning it is the shredded type
243- Typed {
244- metadata : BinaryViewArray ,
245- typed_value : ArrayRef ,
246- } ,
247- /// Partially shredded:
248- /// * value is an object
249- /// * typed_value is a shredded object.
339+ PerfectlyShredded { typed_value : ArrayRef } ,
340+ /// Imperfectly shredded: Shredded values reside in `typed_value` while those that failed to
341+ /// shred reside in `value`. Missing field values are NULL in both columns, while NULL primitive
342+ /// values have NULL `typed_value` and `Variant::Null` in `value`.
250343 ///
251- /// Note the spec says "Writers must not produce data where both value and
252- /// typed_value are non-null, unless the Variant value is an object."
253- PartiallyShredded {
254- metadata : BinaryViewArray ,
344+ /// NOTE: A partially shredded struct is a special kind of imperfect shredding, where
345+ /// `typed_value` and `value` are both non-NULL. The `typed_value` is a struct containing the
346+ /// subset of fields for which shredding was attempted (each field will then have its own value
347+ /// and/or typed_value sub-fields that indicate how shredding actually turned out). Meanwhile,
348+ /// the `value` is a variant object containing the subset of fields for which shredding was
349+ /// not even attempted.
350+ ImperfectlyShredded {
255351 value : BinaryViewArray ,
256352 typed_value : ArrayRef ,
257353 } ,
@@ -319,8 +415,7 @@ impl ShreddingState {
319415 /// Slice all the underlying arrays
320416 pub fn slice ( & self , offset : usize , length : usize ) -> Self {
321417 match self {
322- ShreddingState :: Unshredded { metadata, value } => ShreddingState :: Unshredded {
323- metadata : metadata. slice ( offset, length) ,
418+ ShreddingState :: Unshredded { value } => ShreddingState :: Unshredded {
324419 value : value. slice ( offset, length) ,
325420 } ,
326421 ShreddingState :: Typed {
@@ -346,6 +441,45 @@ impl ShreddingState {
346441 }
347442}
348443
444+ /// Builds struct arrays from component fields
445+ ///
446+ /// TODO: move to arrow crate
447+ #[ derive( Debug , Default , Clone ) ]
448+ pub struct StructArrayBuilder {
449+ fields : Vec < FieldRef > ,
450+ arrays : Vec < ArrayRef > ,
451+ nulls : Option < NullBuffer > ,
452+ }
453+
454+ impl StructArrayBuilder {
455+ pub fn new ( ) -> Self {
456+ Default :: default ( )
457+ }
458+
459+ /// Add an array to this struct array as a field with the specified name.
460+ pub fn with_field ( mut self , field_name : & str , array : ArrayRef ) -> Self {
461+ let field = Field :: new ( field_name, array. data_type ( ) . clone ( ) , true ) ;
462+ self . fields . push ( Arc :: new ( field) ) ;
463+ self . arrays . push ( array) ;
464+ self
465+ }
466+
467+ /// Set the null buffer for this struct array.
468+ pub fn with_nulls ( mut self , nulls : NullBuffer ) -> Self {
469+ self . nulls = Some ( nulls) ;
470+ self
471+ }
472+
473+ pub fn build ( self ) -> StructArray {
474+ let Self {
475+ fields,
476+ arrays,
477+ nulls,
478+ } = self ;
479+ StructArray :: new ( Fields :: from ( fields) , arrays, nulls)
480+ }
481+ }
482+
349483/// returns the non-null element at index as a Variant
350484fn typed_value_to_variant ( typed_value : & ArrayRef , index : usize ) -> Variant < ' _ , ' _ > {
351485 match typed_value. data_type ( ) {
@@ -388,9 +522,11 @@ impl Array for VariantArray {
388522
389523 fn slice ( & self , offset : usize , length : usize ) -> ArrayRef {
390524 let inner = self . inner . slice ( offset, length) ;
525+ let metadata = self . metadata . slice ( offset, length) ;
391526 let shredding_state = self . shredding_state . slice ( offset, length) ;
392527 Arc :: new ( Self {
393528 inner,
529+ metadata,
394530 shredding_state,
395531 } )
396532 }
0 commit comments