diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index dc4cbe6834c4..a00bf7271bba 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -325,7 +325,8 @@ impl GenericByteViewArray { /// Use with caution as this can be an expensive operation, only use it when you are sure that the view /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. pub fn gc(&self) -> Self { - let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); + let mut builder = + GenericByteViewBuilder::::with_capacity(self.len()).with_deduplicate_strings(); for v in self.iter() { builder.append_option(v); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 2bcc5a3f302f..dda553545640 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -22,6 +22,8 @@ use std::sync::Arc; use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; use arrow_data::ByteView; use arrow_schema::ArrowError; +use hashbrown::hash_table::Entry; +use hashbrown::HashTable; use crate::builder::ArrayBuilder; use crate::types::bytes::ByteArrayNativeType; @@ -57,6 +59,9 @@ pub struct GenericByteViewBuilder { completed: Vec, in_progress: Vec, block_size: u32, + /// Some if deduplicating strings + /// map ` -> ` + string_tracker: Option<(HashTable, ahash::RandomState)>, phantom: PhantomData, } @@ -74,6 +79,7 @@ impl GenericByteViewBuilder { completed: vec![], in_progress: vec![], block_size: DEFAULT_BLOCK_SIZE, + string_tracker: None, phantom: Default::default(), } } @@ -83,6 +89,20 @@ impl GenericByteViewBuilder { Self { block_size, ..self } } + /// Deduplicate strings while building the array + /// + /// This will potentially decrease the memory usage if the array have repeated strings + /// It will also increase the time to build the array as it needs to hash the strings + pub fn with_deduplicate_strings(self) -> Self { + Self { + string_tracker: Some(( + HashTable::with_capacity(self.views_builder.capacity()), + Default::default(), + )), + ..self + } + } + /// Append a new data block returning the new block offset /// /// Note: this will first flush any in-progress block @@ -179,6 +199,26 @@ impl GenericByteViewBuilder { self.completed.push(block); } + /// Returns the value at the given index + /// Useful if we want to know what value has been inserted to the builder + fn get_value(&self, index: usize) -> &[u8] { + let view = self.views_builder.as_slice().get(index).unwrap(); + let len = *view as u32; + if len <= 12 { + // # Safety + // The view is valid from the builder + unsafe { GenericByteViewArray::::inline_value(view, len as usize) } + } else { + let view = ByteView::from(*view); + if view.buffer_index < self.completed.len() as u32 { + let block = &self.completed[view.buffer_index as usize]; + &block[view.offset as usize..view.offset as usize + view.length as usize] + } else { + &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] + } + } + } + /// Appends a value into the builder /// /// # Panics @@ -199,6 +239,40 @@ impl GenericByteViewBuilder { return; } + // Deduplication if: + // (1) deduplication is enabled. + // (2) len > 12 + if let Some((mut ht, hasher)) = self.string_tracker.take() { + let hash_val = hasher.hash_one(v); + let hasher_fn = |v: &_| hasher.hash_one(v); + + let entry = ht.entry( + hash_val, + |idx| { + let stored_value = self.get_value(*idx); + v == stored_value + }, + hasher_fn, + ); + match entry { + Entry::Occupied(occupied) => { + // If the string already exists, we will directly use the view + let idx = occupied.get(); + self.views_builder + .append(self.views_builder.as_slice()[*idx]); + self.null_buffer_builder.append_non_null(); + self.string_tracker = Some((ht, hasher)); + return; + } + Entry::Vacant(vacant) => { + // o.w. we insert the (string hash -> view index) + // the idx is current length of views_builder, as we are inserting a new view + vacant.insert(self.views_builder.len()); + } + } + self.string_tracker = Some((ht, hasher)); + } + let required_cap = self.in_progress.len() + v.len(); if self.in_progress.capacity() < required_cap { self.flush_in_progress(); @@ -357,6 +431,42 @@ mod tests { use super::*; use crate::Array; + #[test] + fn test_string_view_deduplicate() { + let value_1 = "long string to test string view"; + let value_2 = "not so similar string but long"; + + let mut builder = StringViewBuilder::new() + .with_deduplicate_strings() + .with_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers + + let values = vec![ + Some(value_1), + Some(value_2), + Some("short"), + Some(value_1), + None, + Some(value_2), + Some(value_1), + ]; + builder.extend(values.clone()); + + let array = builder.finish_cloned(); + array.to_data().validate_full().unwrap(); + assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers. + let actual: Vec<_> = array.iter().collect(); + assert_eq!(actual, values); + + let view0 = array.views().first().unwrap(); + let view3 = array.views().get(3).unwrap(); + let view6 = array.views().get(6).unwrap(); + + assert_eq!(view0, view3); + assert_eq!(view0, view6); + + assert_eq!(array.views().get(1), array.views().get(5)); + } + #[test] fn test_string_view() { let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");