diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index acea680ae374..641983700334 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -389,24 +389,21 @@ impl From>> for BooleanArray { impl From for BooleanArray { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); assert_eq!( - data.data_type(), - &DataType::Boolean, - "BooleanArray expected ArrayData with type {} got {}", + data_type, DataType::Boolean, - data.data_type() + "BooleanArray expected ArrayData with type Boolean got {data_type:?}", ); assert_eq!( - data.buffers().len(), + buffers.len(), 1, "BooleanArray data should contain a single buffer only (values buffer)" ); - let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let buffer = buffers.pop().expect("checked above"); + let values = BooleanBuffer::new(buffer, offset, len); - Self { - values, - nulls: data.nulls().cloned(), - } + Self { values, nulls } } } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index bd85bffcfe44..71c221678826 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -542,30 +542,34 @@ impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray { impl From for GenericByteArray { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); assert_eq!( - data.data_type(), - &Self::DATA_TYPE, + data_type, + Self::DATA_TYPE, "{}{}Array expects DataType::{}", T::Offset::PREFIX, T::PREFIX, Self::DATA_TYPE ); assert_eq!( - data.buffers().len(), + buffers.len(), 2, "{}{}Array data should contain 2 buffers only (offsets and values)", T::Offset::PREFIX, T::PREFIX, ); + // buffers are offset then value, so pop in reverse + let value_data = buffers.pop().expect("checked above"); + let offset_buffer = buffers.pop().expect("checked above"); + // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(&data) }; - let value_data = data.buffers()[1].clone(); + let value_offsets = unsafe { get_offsets(offset_buffer, offset, len) }; Self { value_offsets, value_data, data_type: T::DATA_TYPE, - nulls: data.nulls().cloned(), + nulls, } } } diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index b31c76ab5a27..501ceb212b68 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -943,15 +943,15 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray } impl From for GenericByteViewArray { - fn from(value: ArrayData) -> Self { - let views = value.buffers()[0].clone(); - let views = ScalarBuffer::new(views, value.offset(), value.len()); - let buffers = value.buffers()[1..].to_vec(); + fn from(data: ArrayData) -> Self { + let (_data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); + let views = buffers.remove(0); // need to maintain order of remaining buffers + let views = ScalarBuffer::new(views, offset, len); Self { data_type: T::DATA_TYPE, views, buffers, - nulls: value.nulls().cloned(), + nulls, phantom: Default::default(), } } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index be7703b13c5c..33cc2ade949f 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -26,7 +26,7 @@ use crate::{ use arrow_buffer::bit_util::set_bit; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -583,18 +583,21 @@ impl DictionaryArray { /// Constructs a `DictionaryArray` from an array data reference. impl From for DictionaryArray { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, offset, buffers, mut child_data) = data.into_parts(); + assert_eq!( - data.buffers().len(), + buffers.len(), 1, "DictionaryArray data should contain a single buffer only (keys)." ); assert_eq!( - data.child_data().len(), + child_data.len(), 1, "DictionaryArray should contain a single child array (values)." ); + let cd = child_data.pop().expect("checked above"); - if let DataType::Dictionary(key_data_type, _) = data.data_type() { + if let DataType::Dictionary(key_data_type, _) = &data_type { assert_eq!( &T::DATA_TYPE, key_data_type.as_ref(), @@ -603,17 +606,17 @@ impl From for DictionaryArray { key_data_type ); - let values = make_array(data.child_data()[0].clone()); - let data_type = data.data_type().clone(); + let values = make_array(cd); // create a zero-copy of the keys' data // SAFETY: // ArrayData is valid and verified type above - let keys = PrimitiveArray::::from(unsafe { - data.into_builder() - .data_type(T::DATA_TYPE) - .child_data(vec![]) + ArrayDataBuilder::new(T::DATA_TYPE) + .buffers(buffers) + .nulls(nulls) + .offset(offset) + .len(len) .build_unchecked() }); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index b94e168cfe7c..f736d1f1eb15 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -497,24 +497,25 @@ impl FixedSizeBinaryArray { impl From for FixedSizeBinaryArray { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts(); + assert_eq!( - data.buffers().len(), + buffers.len(), 1, "FixedSizeBinaryArray data should contain 1 buffer only (values)" ); - let value_length = match data.data_type() { - DataType::FixedSizeBinary(len) => *len, + let value_length = match data_type { + DataType::FixedSizeBinary(len) => len, _ => panic!("Expected data type to be FixedSizeBinary"), }; let size = value_length as usize; - let value_data = - data.buffers()[0].slice_with_length(data.offset() * size, data.len() * size); + let value_data = buffers[0].slice_with_length(offset * size, len * size); Self { - data_type: data.data_type().clone(), - nulls: data.nulls().cloned(), - len: data.len(), + data_type, + nulls, + len, value_data, value_length, } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 3d5e8a0787c2..89dd992b0f45 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -429,8 +429,10 @@ impl FixedSizeListArray { impl From for FixedSizeListArray { fn from(data: ArrayData) -> Self { - let value_length = match data.data_type() { - DataType::FixedSizeList(_, len) => *len, + let (data_type, len, nulls, offset, _buffers, child_data) = data.into_parts(); + + let value_length = match data_type { + DataType::FixedSizeList(_, len) => len, data_type => { panic!( "FixedSizeListArray data should contain a FixedSizeList data type, got {data_type}" @@ -439,14 +441,13 @@ impl From for FixedSizeListArray { }; let size = value_length as usize; - let values = - make_array(data.child_data()[0].slice(data.offset() * size, data.len() * size)); + let values = make_array(child_data[0].slice(offset * size, len * size)); Self { - data_type: data.data_type().clone(), + data_type, values, - nulls: data.nulls().cloned(), + nulls, value_length, - len: data.len(), + len, } } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 225be14ae365..f3f764a6ca55 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -479,23 +479,26 @@ impl From for GenericListArray< impl GenericListArray { fn try_new_from_array_data(data: ArrayData) -> Result { - if data.buffers().len() != 1 { + let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts(); + + if buffers.len() != 1 { return Err(ArrowError::InvalidArgumentError(format!( "ListArray data should contain a single buffer only (value offsets), had {}", - data.buffers().len() + buffers.len() ))); } + let buffer = buffers.pop().expect("checked above"); - if data.child_data().len() != 1 { + if child_data.len() != 1 { return Err(ArrowError::InvalidArgumentError(format!( "ListArray should contain a single child array (values array), had {}", - data.child_data().len() + child_data.len() ))); } - let values = data.child_data()[0].clone(); + let values = child_data.pop().expect("checked above"); - if let Some(child_data_type) = Self::get_type(data.data_type()) { + if let Some(child_data_type) = Self::get_type(&data_type) { if values.data_type() != child_data_type { return Err(ArrowError::InvalidArgumentError(format!( "[Large]ListArray's child datatype {:?} does not \ @@ -506,19 +509,18 @@ impl GenericListArray { } } else { return Err(ArrowError::InvalidArgumentError(format!( - "[Large]ListArray's datatype must be [Large]ListArray(). It is {:?}", - data.data_type() + "[Large]ListArray's datatype must be [Large]ListArray(). It is {data_type:?}", ))); } let values = make_array(values); // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(&data) }; + let value_offsets = unsafe { get_offsets(buffer, offset, len) }; Ok(Self { - data_type: data.data_type().clone(), - nulls: data.nulls().cloned(), + data_type, + nulls, values, value_offsets, }) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 52c88d581d20..73ca5e607999 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -576,23 +576,25 @@ impl From for GenericListViewAr impl GenericListViewArray { fn try_new_from_array_data(data: ArrayData) -> Result { - if data.buffers().len() != 2 { + let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts(); + + if buffers.len() != 2 { return Err(ArrowError::InvalidArgumentError(format!( "ListViewArray data should contain two buffers (value offsets & value sizes), had {}", - data.buffers().len() + buffers.len() ))); } - if data.child_data().len() != 1 { + if child_data.len() != 1 { return Err(ArrowError::InvalidArgumentError(format!( "ListViewArray should contain a single child array (values array), had {}", - data.child_data().len() + child_data.len() ))); } - let values = data.child_data()[0].clone(); + let values = child_data.pop().expect("checked above"); - if let Some(child_data_type) = Self::get_type(data.data_type()) { + if let Some(child_data_type) = Self::get_type(&data_type) { if values.data_type() != child_data_type { return Err(ArrowError::InvalidArgumentError(format!( "{}ListViewArray's child datatype {:?} does not \ @@ -607,18 +609,21 @@ impl GenericListViewArray { "{}ListViewArray's datatype must be {}ListViewArray(). It is {:?}", OffsetSize::PREFIX, OffsetSize::PREFIX, - data.data_type() + data_type ))); } let values = make_array(values); // ArrayData is valid, and verified type above - let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + // buffer[0] is offsets, buffer[1] is sizes + let sizes_buffer = buffers.pop().expect("checked above"); + let offsets_buffer = buffers.pop().expect("checked above"); + let value_offsets = ScalarBuffer::new(offsets_buffer, offset, len); + let value_sizes = ScalarBuffer::new(sizes_buffer, offset, len); Ok(Self { - data_type: data.data_type().clone(), - nulls: data.nulls().cloned(), + data_type, + nulls, values, value_offsets, value_sizes, diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 86608d586f34..d2a84b5e5631 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -272,28 +272,29 @@ impl From for ArrayData { impl MapArray { fn try_new_from_array_data(data: ArrayData) -> Result { - if !matches!(data.data_type(), DataType::Map(_, _)) { + let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts(); + + if !matches!(data_type, DataType::Map(_, _)) { return Err(ArrowError::InvalidArgumentError(format!( - "MapArray expected ArrayData with DataType::Map got {}", - data.data_type() + "MapArray expected ArrayData with DataType::Map got {data_type}", ))); } - if data.buffers().len() != 1 { + if buffers.len() != 1 { return Err(ArrowError::InvalidArgumentError(format!( "MapArray data should contain a single buffer only (value offsets), had {}", - data.len() + buffers.len(), ))); } + let buffer = buffers.pop().expect("checked above"); - if data.child_data().len() != 1 { + if child_data.len() != 1 { return Err(ArrowError::InvalidArgumentError(format!( "MapArray should contain a single child array (values array), had {}", - data.child_data().len() + child_data.len() ))); } - - let entries = data.child_data()[0].clone(); + let entries = child_data.pop().expect("checked above"); if let DataType::Struct(fields) = entries.data_type() { if fields.len() != 2 { @@ -312,11 +313,11 @@ impl MapArray { // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(&data) }; + let value_offsets = unsafe { get_offsets(buffer, offset, len) }; Ok(Self { - data_type: data.data_type().clone(), - nulls: data.nulls().cloned(), + data_type, + nulls, entries, value_offsets, }) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 75e32d57e89c..1ffa275f1af0 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -893,22 +893,24 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { make_array(ArrayData::new_null(data_type, length)) } -/// Helper function that gets offset from an [`ArrayData`] +/// Helper function that gets offset from a buffer and array length /// /// # Safety /// -/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer -unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { - match data.is_empty() && data.buffers()[0].is_empty() { - true => OffsetBuffer::new_empty(), - false => { - let buffer = - ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1); - // Safety: - // ArrayData is valid - unsafe { OffsetBuffer::new_unchecked(buffer) } - } - } +/// - buffers must contain a valid [`OffsetBuffer`] for the given length and offset +unsafe fn get_offsets( + buffer: Buffer, + offset: usize, + len: usize, +) -> OffsetBuffer { + if len == 0 && buffer.is_empty() { + return OffsetBuffer::new_empty(); + } + + let scalar_buffer = ScalarBuffer::new(buffer, offset, len + 1); + // Safety: + // Arguments were valid + unsafe { OffsetBuffer::new_unchecked(scalar_buffer) } } /// Helper function for printing potentially long arrays. diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index b682466b6738..18c155b78bf4 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -138,21 +138,19 @@ impl Array for NullArray { impl From for NullArray { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, _offset, buffers, _child_data) = data.into_parts(); + assert_eq!( - data.data_type(), - &DataType::Null, + data_type, + DataType::Null, "NullArray data type should be Null" ); - assert_eq!( - data.buffers().len(), - 0, - "NullArray data should contain 0 buffers" - ); + assert_eq!(buffers.len(), 0, "NullArray data should contain 0 buffers"); assert!( - data.nulls().is_none(), + nulls.is_none(), "NullArray data should not contain a null buffer, as no buffers are required" ); - Self { len: data.len() } + Self { len } } } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 457c2428145e..9bd8bf40f38d 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1601,18 +1601,21 @@ impl PrimitiveArray { /// Constructs a `PrimitiveArray` from an array data reference. impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { - Self::assert_compatible(data.data_type()); + let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); + + Self::assert_compatible(&data_type); assert_eq!( - data.buffers().len(), + buffers.len(), 1, "PrimitiveArray data should contain a single buffer only (values buffer)" ); + let buffer = buffers.pop().expect("checked above"); - let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let values = ScalarBuffer::new(buffer, offset, len); Self { - data_type: data.data_type().clone(), + data_type, values, - nulls: data.nulls().cloned(), + nulls, } } } diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 5254a0ed3cdc..8de8cedc65e8 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -213,27 +213,39 @@ impl RunArray { impl From for RunArray { // The method assumes the caller already validated the data using `ArrayData::validate_data()` fn from(data: ArrayData) -> Self { - match data.data_type() { + let (data_type, len, _nulls, offset, _buffers, mut child_data) = data.into_parts(); + + match &data_type { DataType::RunEndEncoded(_, _) => {} _ => { panic!( - "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded" + "Invalid data type {data_type:?} for RunArray. Should be DataType::RunEndEncoded" ); } } // Safety // ArrayData is valid - let child = &data.child_data()[0]; - assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type"); + // child[0] is run ends , child[1] is values + let values_child = child_data + .pop() + .expect("RunArray data should have two child arrays"); + let run_end_child = child_data + .pop() + .expect("RunArray data should have two child arrays"); + assert_eq!( + run_end_child.data_type(), + &R::DATA_TYPE, + "Incorrect run ends type" + ); let run_ends = unsafe { - let scalar = child.buffers()[0].clone().into(); - RunEndBuffer::new_unchecked(scalar, data.offset(), data.len()) + let scalar = run_end_child.buffers()[0].clone().into(); + RunEndBuffer::new_unchecked(scalar, offset, len) }; - let values = make_array(data.child_data()[1].clone()); + let values = make_array(values_child); Self { - data_type: data.data_type().clone(), + data_type, run_ends, values, } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 6ad1ead0d250..a738a733218a 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -347,25 +347,26 @@ impl StructArray { impl From for StructArray { fn from(data: ArrayData) -> Self { - let parent_offset = data.offset(); - let parent_len = data.len(); + let (data_type, len, nulls, offset, _buffers, child_data) = data.into_parts(); - let fields = data - .child_data() - .iter() + let parent_offset = offset; + let parent_len = len; + + let fields = child_data + .into_iter() .map(|cd| { if parent_offset != 0 || parent_len != cd.len() { make_array(cd.slice(parent_offset, parent_len)) } else { - make_array(cd.clone()) + make_array(cd) } }) .collect(); Self { - len: data.len(), - data_type: data.data_type().clone(), - nulls: data.nulls().cloned(), + len, + data_type, + nulls, fields, } } diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index e08542bc8638..3ff4df044dc9 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -680,32 +680,36 @@ impl UnionArray { impl From for UnionArray { fn from(data: ArrayData) -> Self { - let (fields, mode) = match data.data_type() { - DataType::Union(fields, mode) => (fields, *mode), + // TODO should the _nulls be used/validated? + let (data_type, len, _nulls, offset, mut buffers, child_data) = data.into_parts(); + + let (fields, mode) = match &data_type { + DataType::Union(fields, mode) => (fields, mode), d => panic!("UnionArray expected ArrayData with type Union got {d}"), }; + let (type_ids, offsets) = match mode { - UnionMode::Sparse => ( - ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()), - None, - ), - UnionMode::Dense => ( - ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()), - Some(ScalarBuffer::new( - data.buffers()[1].clone(), - data.offset(), - data.len(), - )), - ), + UnionMode::Sparse => { + let buffer = buffers.pop().expect("1 buffer for type_ids"); + (ScalarBuffer::new(buffer, offset, len), None) + } + UnionMode::Dense => { + let offsets_buffer = buffers.pop().expect("2 buffers for type_ids and offsets"); + let type_ids_buffer = buffers.pop().expect("2 buffers for type_ids and offsets"); + ( + ScalarBuffer::new(type_ids_buffer, offset, len), + Some(ScalarBuffer::new(offsets_buffer, offset, len)), + ) + } }; let max_id = fields.iter().map(|(i, _)| i).max().unwrap_or_default() as usize; let mut boxed_fields = vec![None; max_id + 1]; - for (cd, (field_id, _)) in data.child_data().iter().zip(fields.iter()) { - boxed_fields[field_id as usize] = Some(make_array(cd.clone())); + for (cd, (field_id, _)) in child_data.into_iter().zip(fields.iter()) { + boxed_fields[field_id as usize] = Some(make_array(cd)); } Self { - data_type: data.data_type().clone(), + data_type, type_ids, offsets, fields: boxed_fields, diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index df3b2c578f36..15996d546614 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -309,6 +309,9 @@ impl ArrayData { /// /// Note: This is a low level API and most users of the arrow crate should create /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array) + /// or [`ArrayDataBuilder`]. + /// + /// See also [`Self::into_parts`] to recover the fields pub fn try_new( data_type: DataType, len: usize, @@ -351,6 +354,33 @@ impl ArrayData { Ok(new_self) } + /// Return the constituent parts of this ArrayData + /// + /// This is the inverse of [`ArrayData::try_new`]. + /// + /// Returns `(data_type, len, nulls, offset, buffers, child_data)` + pub fn into_parts( + self, + ) -> ( + DataType, + usize, + Option, + usize, + Vec, + Vec, + ) { + let Self { + data_type, + len, + nulls, + offset, + buffers, + child_data, + } = self; + + (data_type, len, nulls, offset, buffers, child_data) + } + /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`] #[inline] pub const fn builder(data_type: DataType) -> ArrayDataBuilder {