Skip to content

Add List View & Large List View basic ArrayData construction and validation #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 67 additions & 3 deletions arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,42 @@ impl ArrayData {
Ok(())
}

/// Does a cheap sanity check that the `self.len` values in `buffer` are valid
/// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
&self,
values_length: usize,
) -> Result<(), ArrowError> {
let offsets: &[T] = self.typed_buffer(0, self.len)?;
let sizes: &[T] = self.typed_buffer(1, self.len)?;
for i in 0..values_length {
let size = sizes[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting size[{}] ({}) to usize for {}",
i, sizes[i], self.data_type
))})?;
let offset = offsets[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting offset[{}] ({}) to usize for {}",
i, offsets[i], self.data_type
))})?;
if offset > values_length {
return Err(ArrowError::InvalidArgumentError(format!(
"Size {} at index {} is offset {} is out of bounds for {}",
size, i, offset, self.data_type
)));
}
if size > values_length - offset {
return Err(ArrowError::InvalidArgumentError(format!(
"Size {} at index {} is larger than the remaining values for {}",
size, i, self.data_type
)));
}
}

Ok(())
}

/// Validates the layout of `child_data` ArrayData structures
fn validate_child_data(&self) -> Result<(), ArrowError> {
match &self.data_type {
Expand All @@ -942,6 +978,16 @@ impl ArrayData {
self.validate_offsets::<i64>(values_data.len)?;
Ok(())
}
DataType::ListView(field) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;
self.validate_offsets_and_sizes::<i32>(values_data.len)?;
Ok(())
}
DataType::LargeListView(field) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;
self.validate_offsets_and_sizes::<i64>(values_data.len)?;
Ok(())
}
DataType::FixedSizeList(field, list_size) => {
let values_data = self.get_single_valid_child_data(field.data_type())?;

Expand Down Expand Up @@ -1546,9 +1592,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
Expand Down Expand Up @@ -1661,6 +1706,25 @@ impl DataTypeLayout {
variadic: true,
}
}

/// Describes a list view type
pub fn new_list_view<T>() -> Self {
Self {
buffers: vec![
BufferSpec::FixedWidth {
byte_width: mem::size_of::<T>(),
alignment: mem::align_of::<T>(),
},
BufferSpec::FixedWidth {
byte_width: mem::size_of::<T>(),
alignment: mem::align_of::<T>(),
},
],
can_contain_null_mask: true,
variadic: true,
}
}

}

/// Layout specification for a single data type buffer
Expand Down
68 changes: 68 additions & 0 deletions arrow/tests/array_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,74 @@ fn test_validate_offsets_last_too_large() {
.unwrap();
}


/// Test that the list of type `data_type` generates correct offset and size out of bounds errors
fn check_list_view_offsets_sizes<T: ArrowNativeType>(data_type: DataType, offsets : Vec<T>, sizes : Vec<T>) {
let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
let offsets_buffer = Buffer::from_slice_ref(offsets);
let sizes_buffer = Buffer::from_slice_ref(sizes);
ArrayData::try_new(
data_type,
4,
None,
0,
vec![offsets_buffer,sizes_buffer],
vec![values.into_data()],
)
.unwrap();
}

#[test]
#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for ListView")]
fn test_validate_list_view_offsets_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(DataType::ListView(Arc::new(field_type)), vec![0, 1, 1, 2], vec![1,1,1,3]);
}

#[test]
#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for LargeListView")]
fn test_validate_large_list_view_offsets_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(DataType::LargeListView(Arc::new(field_type)),vec![0, 1, 1, 2], vec![1,1,1,3]);
}

#[test]
#[should_panic(
expected = "Error converting offset[1] (-1) to usize for ListView"
)]
fn test_validate_list_view_negative_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(DataType::ListView(Arc::new(field_type)),vec![0, -1, 1, 2], vec![1,1,1,3]);
}

#[test]
#[should_panic(
expected = "Error converting size[2] (-1) to usize for ListView"
)]
fn test_validate_list_view_negative_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i32>(DataType::ListView(Arc::new(field_type)),vec![0, 1, 1, 2], vec![1,1,-1,3]);
}

#[test]
#[should_panic(
expected = "Error converting offset[1] (-1) to usize for LargeListView"
)]
fn test_validate_large_list_view_negative_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(DataType::LargeListView(Arc::new(field_type)),vec![0, -1, 1, 2], vec![1,1,1,3]);
}

#[test]
#[should_panic(
expected = "Error converting size[2] (-1) to usize for LargeListView"
)]
fn test_validate_large_list_view_negative_sizes() {
let field_type = Field::new("f", DataType::Int32, true);
check_list_view_offsets_sizes::<i64>(DataType::LargeListView(Arc::new(field_type)),vec![0, 1, 1, 2], vec![1,1,-1,3]);
}


#[test]
#[should_panic(
expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
Expand Down