-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Avoid clones while creating Arrays from ArrayData (speed up reading from Parquet reader)
#9058
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -943,15 +943,15 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> | |
| } | ||
|
|
||
| impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { | ||
| fn from(value: ArrayData) -> Self { | ||
| let views = value.buffers()[0].clone(); | ||
| let views = ScalarBuffer::new(views, value.offset(), value.len()); | ||
| let buffers = value.buffers()[1..].to_vec(); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this call to I am pretty sure I was seeing this allocation show up in my profiling of the parquet reader for clickbench |
||
| fn from(data: ArrayData) -> Self { | ||
| let (_data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated to the performance improvement: I think this also needs to assert that From a performance perspective, not sure if it makes any measurable difference, but after that assert, you could use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was able to make a reproducer and filed a ticket. |
||
| let views = buffers.remove(0); // need to maintain order of remaining buffers | ||
| let views = ScalarBuffer::new(views, offset, len); | ||
| Self { | ||
| data_type: T::DATA_TYPE, | ||
| views, | ||
| buffers, | ||
| nulls: value.nulls().cloned(), | ||
| nulls, | ||
| phantom: Default::default(), | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,7 @@ use crate::{ | |
| use arrow_buffer::bit_util::set_bit; | ||
| use arrow_buffer::buffer::NullBuffer; | ||
| use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder}; | ||
| use arrow_data::ArrayData; | ||
| use arrow_data::{ArrayData, ArrayDataBuilder}; | ||
| use arrow_schema::{ArrowError, DataType}; | ||
| use std::any::Any; | ||
| use std::sync::Arc; | ||
|
|
@@ -583,18 +583,21 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> { | |
| /// Constructs a `DictionaryArray` from an array data reference. | ||
| impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> { | ||
| fn from(data: ArrayData) -> Self { | ||
| let (data_type, len, nulls, offset, buffers, mut child_data) = data.into_parts(); | ||
|
|
||
| assert_eq!( | ||
| data.buffers().len(), | ||
| buffers.len(), | ||
| 1, | ||
| "DictionaryArray data should contain a single buffer only (keys)." | ||
| ); | ||
| assert_eq!( | ||
| data.child_data().len(), | ||
| child_data.len(), | ||
| 1, | ||
| "DictionaryArray should contain a single child array (values)." | ||
| ); | ||
| let cd = child_data.pop().expect("checked above"); | ||
|
|
||
| if let DataType::Dictionary(key_data_type, _) = data.data_type() { | ||
| if let DataType::Dictionary(key_data_type, _) = &data_type { | ||
| assert_eq!( | ||
| &T::DATA_TYPE, | ||
| key_data_type.as_ref(), | ||
|
|
@@ -603,17 +606,17 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> { | |
| key_data_type | ||
| ); | ||
|
|
||
| let values = make_array(data.child_data()[0].clone()); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here is an example where the entire child_data |
||
| let data_type = data.data_type().clone(); | ||
| let values = make_array(cd); | ||
|
|
||
| // create a zero-copy of the keys' data | ||
| // SAFETY: | ||
| // ArrayData is valid and verified type above | ||
|
|
||
| let keys = PrimitiveArray::<T>::from(unsafe { | ||
| data.into_builder() | ||
| .data_type(T::DATA_TYPE) | ||
| .child_data(vec![]) | ||
| ArrayDataBuilder::new(T::DATA_TYPE) | ||
| .buffers(buffers) | ||
| .nulls(nulls) | ||
| .offset(offset) | ||
| .len(len) | ||
| .build_unchecked() | ||
| }); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -479,23 +479,26 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListArray< | |
|
|
||
| impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { | ||
| fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> { | ||
| if data.buffers().len() != 1 { | ||
| let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts(); | ||
|
|
||
| if buffers.len() != 1 { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "ListArray data should contain a single buffer only (value offsets), had {}", | ||
| data.buffers().len() | ||
| buffers.len() | ||
| ))); | ||
| } | ||
| let buffer = buffers.pop().expect("checked above"); | ||
|
|
||
| if data.child_data().len() != 1 { | ||
| if child_data.len() != 1 { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "ListArray should contain a single child array (values array), had {}", | ||
| data.child_data().len() | ||
| child_data.len() | ||
| ))); | ||
| } | ||
|
|
||
| let values = data.child_data()[0].clone(); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here is another example of not having to clone (and allocate |
||
| let values = child_data.pop().expect("checked above"); | ||
|
|
||
| if let Some(child_data_type) = Self::get_type(data.data_type()) { | ||
| if let Some(child_data_type) = Self::get_type(&data_type) { | ||
| if values.data_type() != child_data_type { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "[Large]ListArray's child datatype {:?} does not \ | ||
|
|
@@ -506,19 +509,18 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { | |
| } | ||
| } else { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "[Large]ListArray's datatype must be [Large]ListArray(). It is {:?}", | ||
| data.data_type() | ||
| "[Large]ListArray's datatype must be [Large]ListArray(). It is {data_type:?}", | ||
| ))); | ||
| } | ||
|
|
||
| let values = make_array(values); | ||
| // SAFETY: | ||
| // ArrayData is valid, and verified type above | ||
| let value_offsets = unsafe { get_offsets(&data) }; | ||
| let value_offsets = unsafe { get_offsets(buffer, offset, len) }; | ||
|
|
||
| Ok(Self { | ||
| data_type: data.data_type().clone(), | ||
| nulls: data.nulls().cloned(), | ||
| data_type, | ||
| nulls, | ||
| values, | ||
| value_offsets, | ||
| }) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this basically shows the new pattern -- rather than cloning parts of
ArrayData, instead simply get the relevant structures that are needed directly as it already gets an ownedArrayDataFor BooleanArray this probably saves some
Arc::clones which isn't a big deal. For more complex types like StructArray and GenericByteViewArray it also saves some Vec allocations which I do think is a bigger deal