diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 0fbddbc6e6df..91623bc22b92 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -809,15 +809,6 @@ where /// Returns the minimum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value -/// -/// # Example -/// ```rust -/// # use arrow_array::Int32Array; -/// # use arrow_arith::aggregate::min; -/// let array = Int32Array::from(vec![8, 2, 4]); -/// let result = min(&array); -/// assert_eq!(result, Some(2)); -/// ``` pub fn min(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, @@ -827,15 +818,6 @@ where /// Returns the maximum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value -/// -/// # Example -/// ```rust -/// # use arrow_array::Int32Array; -/// # use arrow_arith::aggregate::max; -/// let array = Int32Array::from(vec![4, 8, 2]); -/// let result = max(&array); -/// assert_eq!(result, Some(8)); -/// ``` pub fn max(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index d94df49de256..6bf438e64618 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -23,7 +23,7 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. use arrow_array::*; -use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; +use arrow_buffer::buffer::bitwise_quaternary_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not}; use arrow_schema::ArrowError; @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_null_buffer.buffer(), right_null_buffer.offset(), left_values.inner(), @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_nulls.buffer(), right_nulls.offset(), left_values.inner(), @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result( + left: impl AsRef<[u8]>, + left_offset_in_bits: usize, + right: impl AsRef<[u8]>, + right_offset_in_bits: usize, + len_in_bits: usize, + mut op: F, + ) -> Self + where + F: FnMut(u64, u64) -> u64, + { + let left = left.as_ref(); + let right = right.as_ref(); + // try fast path for aligned input + // If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices + // to improve performance. + if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { + unsafe { + let (left_prefix, left_u64s, left_suffix) = left.align_to::(); + let (right_prefix, right_u64s, right_suffix) = right.align_to::(); + // if there is no prefix or suffix, both buffers are aligned and we can do the operation directly + // on u64s + // TODO also handle non empty suffixes by processing them separately + if left_prefix.is_empty() + && right_prefix.is_empty() + && left_suffix.is_empty() + && right_suffix.is_empty() + { + let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + return BooleanBuffer { + buffer: Buffer::from(result_u64s), + bit_offset: 0, + bit_len: len_in_bits, + }; + } + } + } + let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits); + let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits); + + let chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // Soundness: `BitChunks` is a `BitChunks` iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; + + let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); + + BooleanBuffer { + buffer: Buffer::from(buffer), + bit_offset: 0, + bit_len: len_in_bits, + } + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer @@ -655,4 +756,42 @@ mod tests { assert_eq!(result, expected); } } + + #[test] + fn test_from_bitwise_binary_op() { + // pick random boolean inputs + let input_bools_left = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_bools_right = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_buffer_left = BooleanBuffer::from(&input_bools_left[..]); + let input_buffer_right = BooleanBuffer::from(&input_bools_right[..]); + + for left_offset in 0..200 { + for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] { + for len_offset in [0, 1, 44, 100, 256, 300, 512] { + let len = 1024 - len_offset - left_offset.max(right_offset); // ensure we don't go out of bounds + // compute with AND + let result = BooleanBuffer::from_bitwise_binary_op( + input_buffer_left.values(), + left_offset, + input_buffer_right.values(), + right_offset, + len, + |a, b| a & b, + ); + // compute directly from bools + let expected = input_bools_left[left_offset..] + .iter() + .zip(&input_bools_right[right_offset..]) + .take(len) + .map(|(a, b)| *a & *b) + .collect::(); + assert_eq!(result, expected); + } + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..c9e7971709e7 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -61,35 +61,30 @@ where /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated( + since = "57.2.0", + note = "use BooleanBuffer::from_bitwise_binary_op instead" +)] pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, right: &Buffer, right_offset_in_bits: usize, len_in_bits: usize, - mut op: F, + op: F, ) -> Buffer where F: FnMut(u64, u64) -> u64, { - let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); - let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); - - let chunks = left_chunks - .iter() - .zip(right_chunks.iter()) - .map(|(left, right)| op(left, right)); - // Soundness: `BitChunks` is a `BitChunks` iterator which - // correctly reports its upper bound - let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - buffer.extend_from_slice(rem); - - buffer.into() + BooleanBuffer::from_bitwise_binary_op( + left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + op, + ) + .into_inner() } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. @@ -119,7 +114,7 @@ pub fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -127,6 +122,7 @@ pub fn buffer_bin_and( len_in_bits, |a, b| a & b, ) + .into_inner() } /// Apply a bitwise or to two inputs and return the result as a Buffer. @@ -138,7 +134,7 @@ pub fn buffer_bin_or( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -146,6 +142,7 @@ pub fn buffer_bin_or( len_in_bits, |a, b| a | b, ) + .into_inner() } /// Apply a bitwise xor to two inputs and return the result as a Buffer. @@ -157,7 +154,7 @@ pub fn buffer_bin_xor( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -165,6 +162,7 @@ pub fn buffer_bin_xor( len_in_bits, |a, b| a ^ b, ) + .into_inner() } /// Apply a bitwise and_not to two inputs and return the result as a Buffer. @@ -176,7 +174,7 @@ pub fn buffer_bin_and_not( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -184,11 +182,11 @@ pub fn buffer_bin_and_not( len_in_bits, |a, b| a & !b, ) + .into_inner() } /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { - // TODO: should we deprecate this function in favor of the Buffer ! impl ? BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner() } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 211cabf7afc0..4f2c5383a5bc 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -18,7 +18,6 @@ //! Implements the `nullif` function for Arrow arrays. use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; -use arrow_buffer::buffer::bitwise_bin_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer}; use arrow_schema::{ArrowError, DataType}; @@ -75,7 +74,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut valid_count = 0; - let b = bitwise_bin_op_helper( + let b = BooleanBuffer::from_bitwise_binary_op( left.buffer(), left.offset(), right.inner(), @@ -91,18 +90,15 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut null_count = 0; - let buffer = - BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }) - .into_inner(); - (buffer, null_count) + let b = BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); + (b, null_count) } }; - let combined = BooleanBuffer::new(combined, 0, len); // Safety: // Counted nulls whilst computing let nulls = unsafe { NullBuffer::new_unchecked(combined, null_count) }; @@ -494,7 +490,18 @@ mod tests { let r_data = r.to_data(); r_data.validate().unwrap(); - assert_eq!(r.as_ref(), &expected); + assert_eq!( + r.as_ref(), + &expected, + "expected nulls: {:#?}\n\n\ + result nulls: {:#?}\n\n\\ + expected values: {:#?}\n\n\ + result values: {:#?}", + expected.nulls(), + r.nulls(), + expected.values(), + r.as_primitive::().values() + ); } #[test] diff --git a/arrow/benches/zip_kernels.rs b/arrow/benches/zip_kernels.rs index 65f6bb280f00..31cbca639717 100644 --- a/arrow/benches/zip_kernels.rs +++ b/arrow/benches/zip_kernels.rs @@ -21,7 +21,6 @@ use rand::distr::{Distribution, StandardUniform}; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; use std::hint; -use std::ops::Range; use std::sync::Arc; use arrow::array::*; @@ -134,35 +133,6 @@ where } } -struct GenerateStringView { - range: Range, - description: String, - _marker: std::marker::PhantomData, -} - -impl InputGenerator for GenerateStringView { - fn name(&self) -> &str { - self.description.as_str() - } - fn generate_scalar_with_null_value(&self) -> ArrayRef { - new_null_array(&DataType::Utf8View, 1) - } - - fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec { - let array = self.generate_array(seed, number_of_scalars, 0.0); - (0..number_of_scalars).map(|i| array.slice(i, 1)).collect() - } - - fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef { - Arc::new(create_string_view_array_with_len_range_and_seed( - array_length, - null_percentage, - self.range.clone(), - seed, - )) - } -} - fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> { vec![ ("all_true", create_boolean_array(len, 0.0, 1.0)), @@ -303,24 +273,6 @@ fn add_benchmark(c: &mut Criterion) { _marker: std::marker::PhantomData, }, ); - - bench_zip_on_input_generator( - c, - &GenerateStringView { - description: "string_views size (3..10)".to_string(), - range: 3..10, - _marker: std::marker::PhantomData, - }, - ); - - bench_zip_on_input_generator( - c, - &GenerateStringView { - description: "string_views size (10..100)".to_string(), - range: 10..100, - _marker: std::marker::PhantomData, - }, - ); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 1f1dcff9b62a..9f83a50f4f8f 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -208,33 +208,6 @@ pub fn create_string_array_with_len_range_and_prefix_and_seed, - seed: u64, -) -> StringViewArray { - let rng = &mut StdRng::seed_from_u64(seed); - (0..size) - .map(|_| { - if rng.random::() < null_density { - None - } else { - let str_len = rng.random_range(range.clone()); - let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); - let value = String::from_utf8(value).unwrap(); - Some(value) - } - }) - .collect() -} fn create_string_view_array_with_len_range_and_prefix( size: usize, diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index b4a6a375334f..8df6a25c9102 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -129,8 +129,8 @@ impl ArrayReader for StructArrayReader { .len(children_array_len) .child_data( children_array - .into_iter() - .map(|x| x.into_data()) + .iter() + .map(|x| x.to_data()) .collect::>(), );