Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

introduce the roaring multiop in milli #581

Draft
wants to merge 25 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
simplify the cbo roaring bitmap merge_into function
irevoire committed Aug 24, 2022
commit 03d63a8a59120359cac12804a943eaf9ca765048
48 changes: 20 additions & 28 deletions milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ use std::io;
use std::mem::size_of;

use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
use roaring::{IterExt, RoaringBitmap};

/// This is the limit where using a byteorder encoding becomes less size efficient
/// than using a direct roaring encoding; it is also the point where we are able
@@ -59,37 +59,29 @@ impl CboRoaringBitmapCodec {
/// serialized in the buffer; otherwise a RoaringBitmap is created from the
/// values and is serialized in the buffer.
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();

for bytes in slices {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
let roaring = slices
.iter()
.map(|slice| {
if slice.len() <= THRESHOLD * size_of::<u32>() {
let mut reader = slice.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
}
vec.sort_unstable();
// we can unwrap safely because the vector is sorted
let res = RoaringBitmap::from_sorted_iter(vec.iter().copied()).unwrap();
vec.clear();
Ok(res)
} else {
RoaringBitmap::deserialize_from(slice.as_ref()).into()
}
} else {
roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?;
}
}
})
.collect::<io::Result<Vec<_>>>()?;
let roaring = roaring.or();

if roaring.is_empty() {
vec.sort_unstable();
vec.dedup();

if vec.len() <= THRESHOLD {
for integer in vec {
buffer.extend_from_slice(&integer.to_ne_bytes());
}
} else {
// We can unwrap safely because the vector was sorted above.
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
roaring.serialize_into(buffer)?;
}
} else {
roaring.extend(vec);
roaring.serialize_into(buffer)?;
}
roaring.serialize_into(buffer)?;

Ok(())
}