Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 7 additions & 10 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,24 +389,21 @@ impl From<Vec<Option<bool>>> for BooleanArray {

impl From<ArrayData> for BooleanArray {
fn from(data: ArrayData) -> Self {
let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
assert_eq!(
data.data_type(),
&DataType::Boolean,
"BooleanArray expected ArrayData with type {} got {}",
data_type,
DataType::Boolean,
data.data_type()
"BooleanArray expected ArrayData with type Boolean got {data_type:?}",
);
assert_eq!(
data.buffers().len(),
buffers.len(),
1,
"BooleanArray data should contain a single buffer only (values buffer)"
);
let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
let buffer = buffers.pop().expect("checked above");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shows the new pattern: since `from` already receives an owned ArrayData, it takes the structures it needs directly out of it rather than cloning parts of it.

For BooleanArray this probably saves some Arc::clones, which isn't a big deal. For more complex types like StructArray and GenericByteViewArray it also saves some Vec allocations, which I do think is a bigger deal.

let values = BooleanBuffer::new(buffer, offset, len);

Self {
values,
nulls: data.nulls().cloned(),
}
Self { values, nulls }
}
}

Expand Down
16 changes: 10 additions & 6 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,30 +542,34 @@ impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {

impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
fn from(data: ArrayData) -> Self {
let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
assert_eq!(
data.data_type(),
&Self::DATA_TYPE,
data_type,
Self::DATA_TYPE,
"{}{}Array expects DataType::{}",
T::Offset::PREFIX,
T::PREFIX,
Self::DATA_TYPE
);
assert_eq!(
data.buffers().len(),
buffers.len(),
2,
"{}{}Array data should contain 2 buffers only (offsets and values)",
T::Offset::PREFIX,
T::PREFIX,
);
// buffers are offset then value, so pop in reverse
let value_data = buffers.pop().expect("checked above");
let offset_buffer = buffers.pop().expect("checked above");

// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_data = data.buffers()[1].clone();
let value_offsets = unsafe { get_offsets(offset_buffer, offset, len) };
Self {
value_offsets,
value_data,
data_type: T::DATA_TYPE,
nulls: data.nulls().cloned(),
nulls,
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -943,15 +943,15 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T>
}

impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
fn from(value: ArrayData) -> Self {
let views = value.buffers()[0].clone();
let views = ScalarBuffer::new(views, value.offset(), value.len());
let buffers = value.buffers()[1..].to_vec();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This call to to_vec() creates a new Vec (i.e. a new allocation), while the new API reuses the existing allocation.

I am pretty sure I was seeing this allocation show up in my profiling of the parquet reader for clickbench

fn from(data: ArrayData) -> Self {
let (_data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to the performance improvement: I think this also needs to assert that data_type equals T::DATA_TYPE, otherwise it allows unchecked casting from binary to string without utf8 validation.

From a performance perspective, I'm not sure it makes any measurable difference, but after that assert you could use data_type instead of T::DATA_TYPE below. That might avoid a call to DataType::drop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let views = buffers.remove(0); // need to maintain order of remaining buffers
let views = ScalarBuffer::new(views, offset, len);
Self {
data_type: T::DATA_TYPE,
views,
buffers,
nulls: value.nulls().cloned(),
nulls,
phantom: Default::default(),
}
}
Expand Down
23 changes: 13 additions & 10 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use crate::{
use arrow_buffer::bit_util::set_bit;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
use arrow_data::ArrayData;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
Expand Down Expand Up @@ -583,18 +583,21 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
/// Constructs a `DictionaryArray` from an array data reference.
impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
fn from(data: ArrayData) -> Self {
let (data_type, len, nulls, offset, buffers, mut child_data) = data.into_parts();

assert_eq!(
data.buffers().len(),
buffers.len(),
1,
"DictionaryArray data should contain a single buffer only (keys)."
);
assert_eq!(
data.child_data().len(),
child_data.len(),
1,
"DictionaryArray should contain a single child array (values)."
);
let cd = child_data.pop().expect("checked above");

if let DataType::Dictionary(key_data_type, _) = data.data_type() {
if let DataType::Dictionary(key_data_type, _) = &data_type {
assert_eq!(
&T::DATA_TYPE,
key_data_type.as_ref(),
Expand All @@ -603,17 +606,17 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
key_data_type
);

let values = make_array(data.child_data()[0].clone());
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is an example where the entire child_data ArrayData was cloned (this potentially includes two Vecs).

let data_type = data.data_type().clone();
let values = make_array(cd);

// create a zero-copy of the keys' data
// SAFETY:
// ArrayData is valid and verified type above

let keys = PrimitiveArray::<T>::from(unsafe {
data.into_builder()
.data_type(T::DATA_TYPE)
.child_data(vec![])
ArrayDataBuilder::new(T::DATA_TYPE)
.buffers(buffers)
.nulls(nulls)
.offset(offset)
.len(len)
.build_unchecked()
});

Expand Down
17 changes: 9 additions & 8 deletions arrow-array/src/array/fixed_size_binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,24 +497,25 @@ impl FixedSizeBinaryArray {

impl From<ArrayData> for FixedSizeBinaryArray {
fn from(data: ArrayData) -> Self {
let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();

assert_eq!(
data.buffers().len(),
buffers.len(),
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
let value_length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
let value_length = match data_type {
DataType::FixedSizeBinary(len) => len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};

let size = value_length as usize;
let value_data =
data.buffers()[0].slice_with_length(data.offset() * size, data.len() * size);
let value_data = buffers[0].slice_with_length(offset * size, len * size);

Self {
data_type: data.data_type().clone(),
nulls: data.nulls().cloned(),
len: data.len(),
data_type,
nulls,
len,
value_data,
value_length,
}
Expand Down
15 changes: 8 additions & 7 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,10 @@ impl FixedSizeListArray {

impl From<ArrayData> for FixedSizeListArray {
fn from(data: ArrayData) -> Self {
let value_length = match data.data_type() {
DataType::FixedSizeList(_, len) => *len,
let (data_type, len, nulls, offset, _buffers, child_data) = data.into_parts();

let value_length = match data_type {
DataType::FixedSizeList(_, len) => len,
data_type => {
panic!(
"FixedSizeListArray data should contain a FixedSizeList data type, got {data_type}"
Expand All @@ -439,14 +441,13 @@ impl From<ArrayData> for FixedSizeListArray {
};

let size = value_length as usize;
let values =
make_array(data.child_data()[0].slice(data.offset() * size, data.len() * size));
let values = make_array(child_data[0].slice(offset * size, len * size));
Self {
data_type: data.data_type().clone(),
data_type,
values,
nulls: data.nulls().cloned(),
nulls,
value_length,
len: data.len(),
len,
}
}
}
Expand Down
24 changes: 13 additions & 11 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,23 +479,26 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListArray<

impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> {
if data.buffers().len() != 1 {
let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts();

if buffers.len() != 1 {
return Err(ArrowError::InvalidArgumentError(format!(
"ListArray data should contain a single buffer only (value offsets), had {}",
data.buffers().len()
buffers.len()
)));
}
let buffer = buffers.pop().expect("checked above");

if data.child_data().len() != 1 {
if child_data.len() != 1 {
return Err(ArrowError::InvalidArgumentError(format!(
"ListArray should contain a single child array (values array), had {}",
data.child_data().len()
child_data.len()
)));
}

let values = data.child_data()[0].clone();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is another example of not having to clone an entire ArrayData (and allocate Vecs).

let values = child_data.pop().expect("checked above");

if let Some(child_data_type) = Self::get_type(data.data_type()) {
if let Some(child_data_type) = Self::get_type(&data_type) {
if values.data_type() != child_data_type {
return Err(ArrowError::InvalidArgumentError(format!(
"[Large]ListArray's child datatype {:?} does not \
Expand All @@ -506,19 +509,18 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}
} else {
return Err(ArrowError::InvalidArgumentError(format!(
"[Large]ListArray's datatype must be [Large]ListArray(). It is {:?}",
data.data_type()
"[Large]ListArray's datatype must be [Large]ListArray(). It is {data_type:?}",
)));
}

let values = make_array(values);
// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_offsets = unsafe { get_offsets(buffer, offset, len) };

Ok(Self {
data_type: data.data_type().clone(),
nulls: data.nulls().cloned(),
data_type,
nulls,
values,
value_offsets,
})
Expand Down
27 changes: 16 additions & 11 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -576,23 +576,25 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListViewAr

impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> {
if data.buffers().len() != 2 {
let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts();

if buffers.len() != 2 {
return Err(ArrowError::InvalidArgumentError(format!(
"ListViewArray data should contain two buffers (value offsets & value sizes), had {}",
data.buffers().len()
buffers.len()
)));
}

if data.child_data().len() != 1 {
if child_data.len() != 1 {
return Err(ArrowError::InvalidArgumentError(format!(
"ListViewArray should contain a single child array (values array), had {}",
data.child_data().len()
child_data.len()
)));
}

let values = data.child_data()[0].clone();
let values = child_data.pop().expect("checked above");

if let Some(child_data_type) = Self::get_type(data.data_type()) {
if let Some(child_data_type) = Self::get_type(&data_type) {
if values.data_type() != child_data_type {
return Err(ArrowError::InvalidArgumentError(format!(
"{}ListViewArray's child datatype {:?} does not \
Expand All @@ -607,18 +609,21 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
"{}ListViewArray's datatype must be {}ListViewArray(). It is {:?}",
OffsetSize::PREFIX,
OffsetSize::PREFIX,
data.data_type()
data_type
)));
}

let values = make_array(values);
// ArrayData is valid, and verified type above
let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len());
// buffer[0] is offsets, buffer[1] is sizes
let sizes_buffer = buffers.pop().expect("checked above");
let offsets_buffer = buffers.pop().expect("checked above");
let value_offsets = ScalarBuffer::new(offsets_buffer, offset, len);
let value_sizes = ScalarBuffer::new(sizes_buffer, offset, len);

Ok(Self {
data_type: data.data_type().clone(),
nulls: data.nulls().cloned(),
data_type,
nulls,
values,
value_offsets,
value_sizes,
Expand Down
25 changes: 13 additions & 12 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,28 +272,29 @@ impl From<MapArray> for ArrayData {

impl MapArray {
fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> {
if !matches!(data.data_type(), DataType::Map(_, _)) {
let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts();

if !matches!(data_type, DataType::Map(_, _)) {
return Err(ArrowError::InvalidArgumentError(format!(
"MapArray expected ArrayData with DataType::Map got {}",
data.data_type()
"MapArray expected ArrayData with DataType::Map got {data_type}",
)));
}

if data.buffers().len() != 1 {
if buffers.len() != 1 {
return Err(ArrowError::InvalidArgumentError(format!(
"MapArray data should contain a single buffer only (value offsets), had {}",
data.len()
buffers.len(),
)));
}
let buffer = buffers.pop().expect("checked above");

if data.child_data().len() != 1 {
if child_data.len() != 1 {
return Err(ArrowError::InvalidArgumentError(format!(
"MapArray should contain a single child array (values array), had {}",
data.child_data().len()
child_data.len()
)));
}

let entries = data.child_data()[0].clone();
let entries = child_data.pop().expect("checked above");

if let DataType::Struct(fields) = entries.data_type() {
if fields.len() != 2 {
Expand All @@ -312,11 +313,11 @@ impl MapArray {

// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_offsets = unsafe { get_offsets(buffer, offset, len) };

Ok(Self {
data_type: data.data_type().clone(),
nulls: data.nulls().cloned(),
data_type,
nulls,
entries,
value_offsets,
})
Expand Down
Loading
Loading