Skip to content
24 changes: 13 additions & 11 deletions arrow-arith/src/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.

use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
use arrow_buffer::buffer::bitwise_quaternary_op_helper;
use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not};
use arrow_schema::ArrowError;

Expand Down Expand Up @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
// The final null bit is set only if:
// 1. left null bit is set, or
// 2. right data bit is false (because null AND false = false).
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
left_null_buffer.buffer(),
left_null_buffer.offset(),
right_values.inner(),
Expand All @@ -85,7 +85,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
}
(None, Some(right_null_buffer)) => {
// Same as above
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
right_null_buffer.buffer(),
right_null_buffer.offset(),
left_values.inner(),
Expand All @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
// d is right data bits.
// The final null bits are:
// (a | (c & !d)) & (c | (a & !b))
Some(bitwise_quaternary_op_helper(
let buffer = bitwise_quaternary_op_helper(
[
left_null_buffer.buffer(),
left_values.inner(),
Expand All @@ -115,10 +115,11 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
],
left.len(),
|a, b, c, d| (a | (c & !d)) & (c | (a & !b)),
))
);
Some(BooleanBuffer::new(buffer, 0, left.len()))
}
};
let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
let nulls = buffer.map(NullBuffer::new);
Ok(BooleanArray::new(left_values & right_values, nulls))
}

Expand Down Expand Up @@ -169,7 +170,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
// The final null bit is set only if:
// 1. left null bit is set, or
// 2. right data bit is true (because null OR true = true).
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
left_nulls.buffer(),
left_nulls.offset(),
right_values.inner(),
Expand All @@ -180,7 +181,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
}
(None, Some(right_nulls)) => {
// Same as above
Some(bitwise_bin_op_helper(
Some(BooleanBuffer::from_bitwise_binary_op(
right_nulls.buffer(),
right_nulls.offset(),
left_values.inner(),
Expand All @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
// d is right data bits.
// The final null bits are:
// (a | (c & d)) & (c | (a & b))
Some(bitwise_quaternary_op_helper(
let buffer = bitwise_quaternary_op_helper(
[
left_nulls.buffer(),
left_values.inner(),
Expand All @@ -210,11 +211,12 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
],
left.len(),
|a, b, c, d| (a | (c & d)) & (c | (a & b)),
))
);
Some(BooleanBuffer::new(buffer, 0, left.len()))
}
};

let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
let nulls = buffer.map(NullBuffer::new);
Ok(BooleanArray::new(left_values | right_values, nulls))
}

Expand Down
166 changes: 162 additions & 4 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::bit_chunk_iterator::BitChunks;
use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator};
use crate::{
BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, buffer_bin_and, buffer_bin_or,
buffer_bin_xor, buffer_unary_not,
buffer_bin_xor, buffer_unary_not, util,
};

use std::ops::{BitAnd, BitOr, BitXor, Not};
Expand Down Expand Up @@ -127,9 +127,10 @@ impl BooleanBuffer {
/// * The output always has zero offset
///
/// # See Also
/// - [`BooleanBuffer::from_bitwise_binary_op`] to create a new buffer from a binary operation
/// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`]
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of a byte slice
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits
Expand Down Expand Up @@ -179,9 +180,8 @@ impl BooleanBuffer {
result.truncate(chunks.num_bytes());
}

let buffer = Buffer::from(result);
BooleanBuffer {
buffer,
buffer: Buffer::from(result),
offset: 0,
len: len_in_bits,
}
Expand Down Expand Up @@ -212,6 +212,126 @@ impl BooleanBuffer {
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
}

/// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to
/// the relevant bits from two input buffers.
///
/// This function is faster than applying the operation bit by bit as
/// it processes input buffers in chunks of 64 bits (8 bytes) at a time
///
/// # Notes:
/// See notes on [Self::from_bitwise_unary_op]
///
/// # See Also
/// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer.
/// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s
/// ```
/// # use arrow_buffer::{Buffer, BooleanBuffer};
/// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits
/// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits
/// // AND of the first 12 bits
/// let result = BooleanBuffer::from_bitwise_binary_op(
/// &left, 0, &right, 0, 12, |a, b| a & b
/// );
/// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]);
/// ```
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let left = [0b11001100u8, 0b10111010u8];
/// let right = [0b10101010u8, 0b11011100u8];
/// // OR of bits 4..16 from left and bits 0..12 from right
/// let result = BooleanBuffer::from_bitwise_binary_op(
/// &left, 4, &right, 0, 12, |a, b| a | b
/// );
/// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]);
/// ```
pub fn from_bitwise_binary_op<F>(
left: impl AsRef<[u8]>,
left_offset_in_bits: usize,
right: impl AsRef<[u8]>,
right_offset_in_bits: usize,
len_in_bits: usize,
mut op: F,
) -> Self
where
F: FnMut(u64, u64) -> u64,
{
// try fast path for aligned input
if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 {
if let Some(result) = Self::try_from_aligned_bitwise_binary_op(
&left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary
&right.as_ref()[right_offset_in_bits / 8..],
len_in_bits,
&mut op,
) {
return result;
}
}
let left_chunks = BitChunks::new(left.as_ref(), left_offset_in_bits, len_in_bits);
let right_chunks = BitChunks::new(right.as_ref(), right_offset_in_bits, len_in_bits);

let chunks = left_chunks
.iter()
.zip(right_chunks.iter())
.map(|(left, right)| op(left, right));
// Soundness: `BitChunks` is a `BitChunks` iterator which
// correctly reports its upper bound
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };

let remainder_bytes = util::bit_util::ceil(left_chunks.remainder_len(), 8);
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
let rem = &rem.to_le_bytes()[0..remainder_bytes];
buffer.extend_from_slice(rem);

BooleanBuffer {
buffer: Buffer::from(buffer),
offset: 0,
len: len_in_bits,
}
}

/// Like [`Self::from_bitwise_binary_op`] but optimized for the case where the
/// inputs are aligned to byte boundaries
///
/// Returns `None` if the inputs are not fully u64 aligned
fn try_from_aligned_bitwise_binary_op<F>(
left: &[u8],
right: &[u8],
len_in_bits: usize,
op: &mut F,
) -> Option<Self>
where
F: FnMut(u64, u64) -> u64,
{
// Safety: all valid bytes are valid u64s
let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::<u64>() };
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how much of a difference the full u64 alignment really makes on current target architectures? I think unaligned loads on x86 are not really slower. Maybe only if they cross a cache line, which should happen every 8th load. With that assumption, using slice::as_chunks and u64::from_le_bytes on each chunk might be simpler, and would then only require handling a suffix.

Staying with align_to and bailing on non-empty prefixes should also be fine, I would expect buffers to be aligned most of the time, but having a byte length that is a multiple of 8 might be less likely.

Other than this nit the code looks good to me 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TLDR is I don't really know how much of a difference this makes, and I am struggling with measuring the difference

Right now, the benchmarks seem super sensitive to any change. For example, in the most recent version of this PR I simply moved the code and added the check for alignment, and this results in consistently reproducible slowdowns on some of the sliced benchmarks:

and              1.00    211.5±1.92ns        ? ?/sec    1.30    275.5±4.61ns        ? ?/sec
and_sliced_1     1.00   1095.6±3.62ns        ? ?/sec    1.12  1228.6±12.32ns        ? ?/sec
and_sliced_24    1.39    337.9±3.92ns        ? ?/sec    1.00    243.4±1.74ns        ? ?/sec

With that assumption, using slice::as_chunks and u64::from_le_bytes on each chunk might be simpler, and would then only require handling a suffix.

I tried a version like this and will see how it performs

    /// Like [`Self::from_bitwise_binary_op`] but optimized for the case where the
    /// inputs are aligned to byte boundaries
    ///
    /// Both inputs are first trimmed to `ceil(len_in_bits, 8)` bytes. If the
    /// trimmed slices are fully u64 aligned (no unaligned prefix or suffix),
    /// `op` is applied directly to the `u64` words. Otherwise the bytes are
    /// processed as 8-byte little-endian chunks, followed by one zero-padded
    /// remainder chunk.
    ///
    /// This version handles every input itself and therefore always returns
    /// `Some`; the `Option` return type is kept so the caller's fallback logic
    /// does not need to change.
    ///
    /// # Panics
    /// Panics if `left` or `right` is shorter than `ceil(len_in_bits, 8)` bytes.
    fn try_from_aligned_bitwise_binary_op<F>(
        left: &[u8],
        right: &[u8],
        len_in_bits: usize,
        op: &mut F,
    ) -> Option<Self>
    where
        F: FnMut(u64, u64) -> u64,
    {
        // trim to length
        let len_in_bytes = ceil(len_in_bits, 8);
        let left = &left[0..len_in_bytes];
        let right = &right[0..len_in_bytes];
        // Safety: all valid bytes are valid u64s
        let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::<u64>() };
        let (right_prefix, right_u64s, right_suffix) = unsafe { right.align_to::<u64>() };
        if left_prefix.is_empty()
            && right_prefix.is_empty()
            && left_suffix.is_empty()
            && right_suffix.is_empty()
        {
            // the buffers are word (64 bit) aligned, so use optimized Vec code.
            let result_u64s = left_u64s
                .iter()
                .zip(right_u64s.iter())
                .map(|(l, r)| op(*l, *r))
                .collect::<Vec<u64>>();
            Some(BooleanBuffer::new(
                Buffer::from(result_u64s),
                0,
                len_in_bits,
            ))
        } else {
            // Not word aligned: read 8 bytes at a time as little-endian u64s.
            let (left_slices, left_remainder) = left.as_chunks::<8>();
            let (right_slices, right_remainder) = right.as_chunks::<8>();
            debug_assert_eq!(left_slices.len(), right_slices.len());
            debug_assert_eq!(left_remainder.len(), right_remainder.len());

            let mut mutable_result =
                MutableBuffer::with_capacity(left_slices.len() * 8 + left_remainder.len());
            mutable_result.extend_from_iter(
                left_slices
                    .iter()
                    .zip(right_slices.iter())
                    .map(|(l, r)| op(u64::from_le_bytes(*l), u64::from_le_bytes(*r))),
            );

            if !left_remainder.is_empty() {
                // Widen a trailing partial chunk (< 8 bytes) to a u64 by
                // zero-filling the missing high bytes.
                let pad_to_u64 = |tail: &[u8]| {
                    let mut bytes = [0u8; 8];
                    bytes[..tail.len()].copy_from_slice(tail);
                    u64::from_le_bytes(bytes)
                };
                let rem = op(pad_to_u64(left_remainder), pad_to_u64(right_remainder));
                // Only the bytes that were actually present in the input are
                // written to the output.
                mutable_result.extend_from_slice(&rem.to_le_bytes()[..left_remainder.len()]);
            }
            Some(BooleanBuffer {
                buffer: Buffer::from(mutable_result),
                offset: 0,
                len: len_in_bits,
            })
        }
    }

Sadly, it seems like we are not going to be able to use as_chunks

```rust
error: current MSRV (Minimum Supported Rust Version) is 1.85.0 but this item is stable since `1.88.0`
--> arrow-buffer/src/buffer/boolean.rs:335:54
|
335 | let (left_slices, left_remainder) = left.as_chunks::<8>();
| ^^^^^^^^^^^^^^^^
|
= note: you may want to conditionally increase the MSRV considered by Clippy using the `clippy::msrv` attribute
= help: for further information visit https://rust-lang.github.io/rust-clippy/rust-1.91.0/index.html#incompatible_msrv
= note: `-D clippy::incompatible-msrv` implied by `-D warnings`
= help: to override `-D warnings` add `#[allow(clippy::incompatible_msrv)]`
```

let (right_prefix, right_u64s, right_suffix) = unsafe { right.align_to::<u64>() };
if !(left_prefix.is_empty()
&& right_prefix.is_empty()
&& left_suffix.is_empty()
&& right_suffix.is_empty())
{
// Couldn't make this case any faster than the default path, so defer to it.
// It would be nice to handle non-empty prefixes/suffixes here too.
return None;
}
// the buffers are word (64 bit) aligned, so use optimized Vec code.
let result_u64s = left_u64s
.iter()
.zip(right_u64s.iter())
.map(|(l, r)| op(*l, *r))
.collect::<Vec<u64>>();
Some(BooleanBuffer::new(
Buffer::from(result_u64s),
0,
len_in_bits,
))
}

/// Returns the number of set bits in this buffer
pub fn count_set_bits(&self) -> usize {
self.buffer.count_set_bits_offset(self.offset, self.len)
Expand Down Expand Up @@ -591,4 +711,42 @@ mod tests {
assert_eq!(result, expected);
}
}

#[test]
fn test_from_bitwise_binary_op() {
    // Number of random bits generated for each input
    const NUM_BITS: usize = 1024;

    // pick random boolean inputs
    let random_bools =
        || -> Vec<bool> { (0..NUM_BITS).map(|_| rand::random::<bool>()).collect() };
    let left_bools = random_bools();
    let right_bools = random_bools();
    let left_buffer = BooleanBuffer::from(left_bools.as_slice());
    let right_buffer = BooleanBuffer::from(right_bools.as_slice());

    for left_offset in 0..200 {
        for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] {
            // the larger offset bounds how many bits remain in both inputs
            let max_offset = left_offset.max(right_offset);
            for len_offset in [0, 1, 44, 100, 256, 300, 512] {
                // ensure we don't go out of bounds
                let len = NUM_BITS - len_offset - max_offset;

                // compute with AND via the chunked implementation
                let actual = BooleanBuffer::from_bitwise_binary_op(
                    left_buffer.values(),
                    left_offset,
                    right_buffer.values(),
                    right_offset,
                    len,
                    |a, b| a & b,
                );

                // reference result computed bit by bit from the bools
                let expected: BooleanBuffer = left_bools[left_offset..]
                    .iter()
                    .zip(&right_bools[right_offset..])
                    .map(|(l, r)| *l & *r)
                    .take(len)
                    .collect();

                assert_eq!(actual, expected);
            }
        }
    }
}
}
Loading
Loading