Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions arrow-arith/src/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.

use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
use arrow_buffer::buffer::bitwise_quaternary_op_helper;
use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not};
use arrow_schema::ArrowError;

Expand Down Expand Up @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
// The final null bit is set only if:
// 1. left null bit is set, or
// 2. right data bit is false (because null AND false = false).
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
left_null_buffer.buffer(),
left_null_buffer.offset(),
right_values.inner(),
Expand All @@ -85,7 +85,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
}
(None, Some(right_null_buffer)) => {
// Same as above
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
right_null_buffer.buffer(),
right_null_buffer.offset(),
left_values.inner(),
Expand All @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
// d is right data bits.
// The final null bits are:
// (a | (c & !d)) & (c | (a & !b))
Some(bitwise_quaternary_op_helper(
let buffer = bitwise_quaternary_op_helper(
[
left_null_buffer.buffer(),
left_values.inner(),
Expand All @@ -115,10 +115,11 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
],
left.len(),
|a, b, c, d| (a | (c & !d)) & (c | (a & !b)),
))
);
Some(BooleanBuffer::new(buffer, 0, left.len()))
}
};
let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
let nulls = buffer.map(NullBuffer::new);
Ok(BooleanArray::new(left_values & right_values, nulls))
}

Expand Down Expand Up @@ -169,7 +170,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
// The final null bit is set only if:
// 1. left null bit is set, or
// 2. right data bit is true (because null OR true = true).
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
left_nulls.buffer(),
left_nulls.offset(),
right_values.inner(),
Expand All @@ -180,7 +181,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
}
(None, Some(right_nulls)) => {
// Same as above
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
right_nulls.buffer(),
right_nulls.offset(),
left_values.inner(),
Expand All @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
// d is right data bits.
// The final null bits are:
// (a | (c & d)) & (c | (a & b))
Some(bitwise_quaternary_op_helper(
let buffer = bitwise_quaternary_op_helper(
[
left_nulls.buffer(),
left_values.inner(),
Expand All @@ -210,11 +211,12 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
],
left.len(),
|a, b, c, d| (a | (c & d)) & (c | (a & b)),
))
);
Some(BooleanBuffer::new(buffer, 0, left.len()))
}
};

let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
let nulls = buffer.map(NullBuffer::new);
Ok(BooleanArray::new(left_values | right_values, nulls))
}

Expand Down
151 changes: 148 additions & 3 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,12 +165,14 @@ impl BooleanBuffer {
/// * `op` must only apply bitwise operations
/// on the relevant bits; the input `u64` may contain irrelevant bits
/// and may be processed differently on different endian architectures.
/// * `op` may be called with input bits outside the requested range
/// * The output always has zero offset
///
/// # See Also
/// - [`BooleanBuffer::from_bitwise_binary_op`] to create a new buffer from a binary operation
/// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`]
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of a byte slice
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits
Expand Down Expand Up @@ -220,9 +222,8 @@ impl BooleanBuffer {
result.truncate(chunks.num_bytes());
}

let buffer = Buffer::from(result);
BooleanBuffer {
buffer,
buffer: Buffer::from(result),
bit_offset: 0,
bit_len: len_in_bits,
}
Expand Down Expand Up @@ -253,6 +254,112 @@ impl BooleanBuffer {
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
}

/// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to
/// the relevant bits from two input buffers.
///
/// This function is faster than applying the operation bit by bit because
/// it processes the input buffers in chunks of 64 bits (8 bytes) at a time.
///
/// # Notes:
/// See the notes on [`Self::from_bitwise_unary_op`].
///
/// # See Also
/// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer.
/// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s
/// ```
/// # use arrow_buffer::{Buffer, BooleanBuffer};
/// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits
/// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits
/// // AND of the first 12 bits
/// let result = BooleanBuffer::from_bitwise_binary_op(
/// &left, 0, &right, 0, 12, |a, b| a & b
/// );
/// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]);
/// ```
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let left = [0b11001100u8, 0b10111010u8];
/// let right = [0b10101010u8, 0b11011100u8];
/// // OR of bits 4..16 from left and bits 0..12 from right
/// let result = BooleanBuffer::from_bitwise_binary_op(
/// &left, 4, &right, 0, 12, |a, b| a | b
/// );
/// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]);
/// ```
pub fn from_bitwise_binary_op<F>(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the new API

left: impl AsRef<[u8]>,
left_offset_in_bits: usize,
right: impl AsRef<[u8]>,
right_offset_in_bits: usize,
len_in_bits: usize,
mut op: F,
) -> Self
where
F: FnMut(u64, u64) -> u64,
{
let left = left.as_ref();
let right = right.as_ref();
// try fast path for aligned input
// If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices
// to improve performance.
if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 {
// align to byte boundary
let left = &left[left_offset_in_bits / 8..];
let right = &right[right_offset_in_bits / 8..];

unsafe {
let (left_prefix, left_u64s, left_suffix) = left.align_to::<u64>();
let (right_prefix, right_u64s, right_suffix) = right.align_to::<u64>();
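// if there is no prefix or suffix, both buffers are u64-aligned and
// we can do the operation directly on u64s.
// NOTE(review): this path applies `op` to every u64 word of both inputs and
// never compares their length against `len_in_bits` — confirm this is intended.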
// TODO: consider `slice::as_chunks` and `u64::from_le_bytes` when MSRV reaches 1.88.
// https://github.com/apache/arrow-rs/pull/9022#discussion_r2639949361
if left_prefix.is_empty()
&& right_prefix.is_empty()
&& left_suffix.is_empty()
&& right_suffix.is_empty()
{
let result_u64s = left_u64s
.iter()
.zip(right_u64s.iter())
.map(|(l, r)| op(*l, *r))
.collect::<Vec<u64>>();
return BooleanBuffer {
buffer: Buffer::from(result_u64s),
bit_offset: 0,
bit_len: len_in_bits,
};
}
}
}
let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits);
let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits);

let chunks = left_chunks
.iter()
.zip(right_chunks.iter())
.map(|(left, right)| op(left, right));
// Soundness: zipping two `BitChunks` iterators yields an iterator which
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trusted Len ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in a22ad8d

// correctly reports its upper bound
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };

let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8);
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
// bits are counted starting from the least significant bit, so `to_le_bytes` yields the right byte order
let rem = &rem.to_le_bytes()[0..remainder_bytes];
buffer.extend_from_slice(rem);

BooleanBuffer {
buffer: Buffer::from(buffer),
bit_offset: 0,
bit_len: len_in_bits,
}
}

/// Returns the number of set bits in this buffer
pub fn count_set_bits(&self) -> usize {
self.buffer
Expand Down Expand Up @@ -655,4 +762,42 @@ mod tests {
assert_eq!(result, expected);
}
}

/// Checks `BooleanBuffer::from_bitwise_binary_op` against a bool-by-bool
/// reference for many combinations of left offset, right offset and length,
/// using bitwise `AND` as the operation.
#[test]
fn test_from_bitwise_binary_op() {
    const TOTAL_BITS: usize = 1024;

    // Random inputs, kept both as plain bools (the reference model) and as
    // packed buffers (the input handed to the function under test).
    let left_bools: Vec<bool> = (0..TOTAL_BITS).map(|_| rand::random::<bool>()).collect();
    let right_bools: Vec<bool> = (0..TOTAL_BITS).map(|_| rand::random::<bool>()).collect();
    let left_buffer = BooleanBuffer::from(&left_bools[..]);
    let right_buffer = BooleanBuffer::from(&right_bools[..]);

    let right_offsets = [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200];
    let len_offsets = [0, 1, 44, 100, 256, 300, 512];

    for left_offset in 0..200 {
        for right_offset in right_offsets {
            // The larger of the two offsets limits how many bits remain readable.
            let largest_offset = left_offset.max(right_offset);
            for len_offset in len_offsets {
                // Shrink the length so neither input is read out of bounds.
                let len = TOTAL_BITS - len_offset - largest_offset;

                // Result produced by the chunked implementation.
                let actual = BooleanBuffer::from_bitwise_binary_op(
                    left_buffer.values(),
                    left_offset,
                    right_buffer.values(),
                    right_offset,
                    len,
                    |l, r| l & r,
                );

                // Reference result computed one bool at a time.
                let left_window = &left_bools[left_offset..left_offset + len];
                let right_window = &right_bools[right_offset..right_offset + len];
                let reference = left_window
                    .iter()
                    .zip(right_window)
                    .map(|(l, r)| *l & *r)
                    .collect::<BooleanBuffer>();

                assert_eq!(actual, reference);
            }
        }
    }
}
}
Loading
Loading