-
Notifications
You must be signed in to change notification settings - Fork 1k
[Variant] Support Shredded Objects in variant_get #8166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -105,11 +105,26 @@ impl VariantArray { | |||||
| ))); | ||||||
| }; | ||||||
|
|
||||||
| // Extract value and typed_value fields | ||||||
| let value = if let Some(value_col) = inner.column_by_name("value") { | ||||||
| if let Some(binary_view) = value_col.as_binary_view_opt() { | ||||||
| Some(binary_view.clone()) | ||||||
| } else { | ||||||
| return Err(ArrowError::NotYetImplemented(format!( | ||||||
| "VariantArray 'value' field must be BinaryView, got {}", | ||||||
| value_col.data_type() | ||||||
| ))); | ||||||
| } | ||||||
| } else { | ||||||
| None | ||||||
| }; | ||||||
| let typed_value = inner.column_by_name("typed_value").cloned(); | ||||||
|
|
||||||
| // Note these clones are cheap, they just bump the ref count | ||||||
| Ok(Self { | ||||||
| inner: inner.clone(), | ||||||
| metadata: metadata.clone(), | ||||||
| shredding_state: ShreddingState::try_new(inner)?, | ||||||
| shredding_state: ShreddingState::try_new(metadata.clone(), value, typed_value)?, | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -135,7 +150,7 @@ impl VariantArray { | |||||
| // This would be a lot simpler if ShreddingState were just a pair of Option... we already | ||||||
| // have everything we need. | ||||||
| let inner = builder.build(); | ||||||
| let shredding_state = ShreddingState::try_new(&inner).unwrap(); // valid by construction | ||||||
| let shredding_state = ShreddingState::try_new(metadata.clone(), value, typed_value).unwrap(); // valid by construction | ||||||
| Self { | ||||||
| inner, | ||||||
| metadata, | ||||||
|
|
@@ -180,24 +195,28 @@ impl VariantArray { | |||||
| /// caller to ensure that the metadata and value were constructed correctly. | ||||||
| pub fn value(&self, index: usize) -> Variant<'_, '_> { | ||||||
| match &self.shredding_state { | ||||||
| ShreddingState::Unshredded { value } => { | ||||||
| ShreddingState::Unshredded { value, .. } => { | ||||||
| // Unshredded case | ||||||
| Variant::new(self.metadata.value(index), value.value(index)) | ||||||
| } | ||||||
| ShreddingState::PerfectlyShredded { typed_value, .. } => { | ||||||
| ShreddingState::Typed { typed_value, .. } => { | ||||||
| // Typed case (formerly PerfectlyShredded) | ||||||
| if typed_value.is_null(index) { | ||||||
| Variant::Null | ||||||
| } else { | ||||||
| typed_value_to_variant(typed_value, index) | ||||||
| } | ||||||
| } | ||||||
| ShreddingState::ImperfectlyShredded { value, typed_value } => { | ||||||
| ShreddingState::PartiallyShredded { value, typed_value, .. } => { | ||||||
| // PartiallyShredded case (formerly ImperfectlyShredded) | ||||||
|
Comment on lines
+210
to
+211
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is incorrect naming. The term "partially shredded" is specifically defined in the variant shredding spec, and that definition does not agree with how the current code uses the name:
Partial shredding is just one specific kind of imperfect shredding -- any type can shred imperfectly, producing both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realize this PR didn't make the naming decisions, but the wording in the PR is not consistent with these namings. I would personally prefer to fix the naming here to be consistent with how the PR uses it, but we could also try to adjust the PR wording to match these enum variant names. |
||||||
| if typed_value.is_null(index) { | ||||||
| Variant::new(self.metadata.value(index), value.value(index)) | ||||||
| } else { | ||||||
| typed_value_to_variant(typed_value, index) | ||||||
| } | ||||||
| } | ||||||
| ShreddingState::AllNull { .. } => { | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another naming nit: This should probably be called |
||||||
| // AllNull case: neither value nor typed_value fields exist | ||||||
| // NOTE: This handles the case where neither value nor typed_value fields exist. | ||||||
| // For top-level variants, this returns Variant::Null (JSON null). | ||||||
| // For shredded object fields, this technically should indicate SQL NULL, | ||||||
|
|
@@ -256,8 +275,11 @@ impl VariantArray { | |||||
| /// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`). | ||||||
| /// | ||||||
| /// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a | ||||||
| /// variant value. | ||||||
| /// variant value (which could be `Variant::Null`). | ||||||
| #[derive(Debug)] | ||||||
| pub struct ShreddedVariantFieldArray { | ||||||
| /// Reference to the underlying StructArray | ||||||
| inner: StructArray, | ||||||
| shredding_state: ShreddingState, | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -284,15 +306,24 @@ impl ShreddedVariantFieldArray { | |||||
| /// | ||||||
| /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. | ||||||
| pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> { | ||||||
| let Some(inner) = inner.as_struct_opt() else { | ||||||
| let Some(inner_struct) = inner.as_struct_opt() else { | ||||||
| return Err(ArrowError::InvalidArgumentError( | ||||||
| "Invalid VariantArray: requires StructArray as input".to_string(), | ||||||
| "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(), | ||||||
| )); | ||||||
| }; | ||||||
|
|
||||||
| // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) | ||||||
| let value = inner_struct.column_by_name("value").and_then(|col| col.as_binary_view_opt().cloned()); | ||||||
| let typed_value = inner_struct.column_by_name("typed_value").cloned(); | ||||||
|
|
||||||
| // Use a dummy metadata for the constructor (ShreddedVariantFieldArray doesn't have metadata) | ||||||
| let dummy_metadata = arrow::array::BinaryViewArray::new_null(inner_struct.len()); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| // Note this clone is cheap, it just bumps the ref count | ||||||
| let inner = inner_struct.clone(); | ||||||
| Ok(Self { | ||||||
| shredding_state: ShreddingState::try_new(inner)?, | ||||||
| inner: inner.clone(), | ||||||
| shredding_state: ShreddingState::try_new(dummy_metadata, value, typed_value)?, | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -310,6 +341,65 @@ impl ShreddedVariantFieldArray { | |||||
| pub fn typed_value_field(&self) -> Option<&ArrayRef> { | ||||||
| self.shredding_state.typed_value_field() | ||||||
| } | ||||||
|
|
||||||
| /// Returns a reference to the underlying [`StructArray`]. | ||||||
| pub fn inner(&self) -> &StructArray { | ||||||
| &self.inner | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| impl Array for ShreddedVariantFieldArray { | ||||||
| fn as_any(&self) -> &dyn Any { | ||||||
| self | ||||||
| } | ||||||
|
|
||||||
| fn to_data(&self) -> ArrayData { | ||||||
| self.inner.to_data() | ||||||
| } | ||||||
|
|
||||||
| fn into_data(self) -> ArrayData { | ||||||
| self.inner.into_data() | ||||||
| } | ||||||
|
|
||||||
| fn data_type(&self) -> &DataType { | ||||||
| self.inner.data_type() | ||||||
| } | ||||||
|
|
||||||
| fn slice(&self, offset: usize, length: usize) -> ArrayRef { | ||||||
| let inner = self.inner.slice(offset, length); | ||||||
| let shredding_state = self.shredding_state.slice(offset, length); | ||||||
| Arc::new(Self { | ||||||
| inner, | ||||||
| shredding_state, | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
| fn len(&self) -> usize { | ||||||
| self.inner.len() | ||||||
| } | ||||||
|
|
||||||
| fn is_empty(&self) -> bool { | ||||||
| self.inner.is_empty() | ||||||
| } | ||||||
|
|
||||||
| fn offset(&self) -> usize { | ||||||
| self.inner.offset() | ||||||
| } | ||||||
|
|
||||||
| fn nulls(&self) -> Option<&NullBuffer> { | ||||||
| // According to the shredding spec, ShreddedVariantFieldArray should be | ||||||
| // physically non-nullable - SQL NULL is inferred by both value and | ||||||
| // typed_value being physically NULL | ||||||
| None | ||||||
| } | ||||||
|
Comment on lines
+389
to
+394
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rescuing from github oblivion:
If |
||||||
|
|
||||||
| fn get_buffer_memory_size(&self) -> usize { | ||||||
| self.inner.get_buffer_memory_size() | ||||||
| } | ||||||
|
|
||||||
| fn get_array_memory_size(&self) -> usize { | ||||||
| self.inner.get_array_memory_size() | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| /// Represents the shredding state of a [`VariantArray`] | ||||||
|
|
@@ -333,10 +423,16 @@ impl ShreddedVariantFieldArray { | |||||
| #[derive(Debug)] | ||||||
| pub enum ShreddingState { | ||||||
| /// This variant has no typed_value field | ||||||
| Unshredded { value: BinaryViewArray }, | ||||||
| Unshredded { | ||||||
| metadata: BinaryViewArray, | ||||||
| value: BinaryViewArray, | ||||||
| }, | ||||||
| /// This variant has a typed_value field and no value field | ||||||
| /// meaning it is the shredded type | ||||||
| PerfectlyShredded { typed_value: ArrayRef }, | ||||||
| Typed { | ||||||
| metadata: BinaryViewArray, | ||||||
| typed_value: ArrayRef, | ||||||
| }, | ||||||
| /// Imperfectly shredded: Shredded values reside in `typed_value` while those that failed to | ||||||
| /// shred reside in `value`. Missing field values are NULL in both columns, while NULL primitive | ||||||
| /// values have NULL `typed_value` and `Variant::Null` in `value`. | ||||||
|
|
@@ -347,7 +443,8 @@ pub enum ShreddingState { | |||||
| /// and/or typed_value sub-fields that indicate how shredding actually turned out). Meanwhile, | ||||||
| /// the `value` is a variant object containing the subset of fields for which shredding was | ||||||
| /// not even attempted. | ||||||
| ImperfectlyShredded { | ||||||
| PartiallyShredded { | ||||||
| metadata: BinaryViewArray, | ||||||
| value: BinaryViewArray, | ||||||
| typed_value: ArrayRef, | ||||||
| }, | ||||||
|
|
@@ -357,7 +454,9 @@ pub enum ShreddingState { | |||||
| /// Note: By strict spec interpretation, this should only be valid for shredded object fields, | ||||||
| /// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic | ||||||
| /// handling of missing data. | ||||||
| AllNull { metadata: BinaryViewArray }, | ||||||
| AllNull { | ||||||
| metadata: BinaryViewArray, | ||||||
| }, | ||||||
| } | ||||||
|
|
||||||
| impl ShreddingState { | ||||||
|
|
@@ -415,7 +514,8 @@ impl ShreddingState { | |||||
| /// Slice all the underlying arrays | ||||||
| pub fn slice(&self, offset: usize, length: usize) -> Self { | ||||||
| match self { | ||||||
| ShreddingState::Unshredded { value } => ShreddingState::Unshredded { | ||||||
| ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded { | ||||||
| metadata: metadata.slice(offset, length), | ||||||
| value: value.slice(offset, length), | ||||||
| }, | ||||||
| ShreddingState::Typed { | ||||||
|
|
@@ -445,7 +545,7 @@ impl ShreddingState { | |||||
| /// | ||||||
| /// TODO: move to arrow crate | ||||||
| #[derive(Debug, Default, Clone)] | ||||||
| pub struct StructArrayBuilder { | ||||||
| pub(crate) struct StructArrayBuilder { | ||||||
| fields: Vec<FieldRef>, | ||||||
| arrays: Vec<ArrayRef>, | ||||||
| nulls: Option<NullBuffer>, | ||||||
|
|
@@ -658,6 +758,7 @@ mod test { | |||||
| let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); | ||||||
| let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); | ||||||
|
|
||||||
| // Verify the shredding state is AllNull | ||||||
| assert!(matches!(shredding_state, ShreddingState::AllNull { .. })); | ||||||
|
|
||||||
| // Verify metadata is preserved correctly | ||||||
|
|
||||||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tiny nit?