Skip to content

Commit 2b986df

Browse files
XiangpengHaoalamb
andauthored
Deduplicate strings/binarys when building view types (#6005)
* implement string view deduplication in builder * make clippy happy * Apply suggestions from code review Co-authored-by: Andrew Lamb <[email protected]> * better coding style --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent b9562b9 commit 2b986df

File tree

2 files changed

+112
-1
lines changed

2 files changed

+112
-1
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
325325
/// Use with caution as this can be an expensive operation, only use it when you are sure that the view
326326
/// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
327327
pub fn gc(&self) -> Self {
328-
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
328+
let mut builder =
329+
GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
329330

330331
for v in self.iter() {
331332
builder.append_option(v);

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ use std::sync::Arc;
2222
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
2323
use arrow_data::ByteView;
2424
use arrow_schema::ArrowError;
25+
use hashbrown::hash_table::Entry;
26+
use hashbrown::HashTable;
2527

2628
use crate::builder::ArrayBuilder;
2729
use crate::types::bytes::ByteArrayNativeType;
@@ -57,6 +59,9 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
5759
completed: Vec<Buffer>,
5860
in_progress: Vec<u8>,
5961
block_size: u32,
62+
/// Some if deduplicating strings
63+
/// map `<string hash> -> <index to the views>`
64+
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
6065
phantom: PhantomData<T>,
6166
}
6267

@@ -74,6 +79,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
7479
completed: vec![],
7580
in_progress: vec![],
7681
block_size: DEFAULT_BLOCK_SIZE,
82+
string_tracker: None,
7783
phantom: Default::default(),
7884
}
7985
}
@@ -83,6 +89,20 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
8389
Self { block_size, ..self }
8490
}
8591

92+
/// Deduplicate strings while building the array
93+
///
94+
/// This will potentially decrease the memory usage if the array have repeated strings
95+
/// It will also increase the time to build the array as it needs to hash the strings
96+
pub fn with_deduplicate_strings(self) -> Self {
97+
Self {
98+
string_tracker: Some((
99+
HashTable::with_capacity(self.views_builder.capacity()),
100+
Default::default(),
101+
)),
102+
..self
103+
}
104+
}
105+
86106
/// Append a new data block returning the new block offset
87107
///
88108
/// Note: this will first flush any in-progress block
@@ -179,6 +199,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
179199
self.completed.push(block);
180200
}
181201

202+
/// Returns the value at the given index
203+
/// Useful if we want to know what value has been inserted to the builder
204+
fn get_value(&self, index: usize) -> &[u8] {
205+
let view = self.views_builder.as_slice().get(index).unwrap();
206+
let len = *view as u32;
207+
if len <= 12 {
208+
// # Safety
209+
// The view is valid from the builder
210+
unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
211+
} else {
212+
let view = ByteView::from(*view);
213+
if view.buffer_index < self.completed.len() as u32 {
214+
let block = &self.completed[view.buffer_index as usize];
215+
&block[view.offset as usize..view.offset as usize + view.length as usize]
216+
} else {
217+
&self.in_progress[view.offset as usize..view.offset as usize + view.length as usize]
218+
}
219+
}
220+
}
221+
182222
/// Appends a value into the builder
183223
///
184224
/// # Panics
@@ -199,6 +239,40 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
199239
return;
200240
}
201241

242+
// Deduplication if:
243+
// (1) deduplication is enabled.
244+
// (2) len > 12
245+
if let Some((mut ht, hasher)) = self.string_tracker.take() {
246+
let hash_val = hasher.hash_one(v);
247+
let hasher_fn = |v: &_| hasher.hash_one(v);
248+
249+
let entry = ht.entry(
250+
hash_val,
251+
|idx| {
252+
let stored_value = self.get_value(*idx);
253+
v == stored_value
254+
},
255+
hasher_fn,
256+
);
257+
match entry {
258+
Entry::Occupied(occupied) => {
259+
// If the string already exists, we will directly use the view
260+
let idx = occupied.get();
261+
self.views_builder
262+
.append(self.views_builder.as_slice()[*idx]);
263+
self.null_buffer_builder.append_non_null();
264+
self.string_tracker = Some((ht, hasher));
265+
return;
266+
}
267+
Entry::Vacant(vacant) => {
268+
// o.w. we insert the (string hash -> view index)
269+
// the idx is current length of views_builder, as we are inserting a new view
270+
vacant.insert(self.views_builder.len());
271+
}
272+
}
273+
self.string_tracker = Some((ht, hasher));
274+
}
275+
202276
let required_cap = self.in_progress.len() + v.len();
203277
if self.in_progress.capacity() < required_cap {
204278
self.flush_in_progress();
@@ -357,6 +431,42 @@ mod tests {
357431
use super::*;
358432
use crate::Array;
359433

434+
#[test]
435+
fn test_string_view_deduplicate() {
436+
let value_1 = "long string to test string view";
437+
let value_2 = "not so similar string but long";
438+
439+
let mut builder = StringViewBuilder::new()
440+
.with_deduplicate_strings()
441+
.with_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers
442+
443+
let values = vec![
444+
Some(value_1),
445+
Some(value_2),
446+
Some("short"),
447+
Some(value_1),
448+
None,
449+
Some(value_2),
450+
Some(value_1),
451+
];
452+
builder.extend(values.clone());
453+
454+
let array = builder.finish_cloned();
455+
array.to_data().validate_full().unwrap();
456+
assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers.
457+
let actual: Vec<_> = array.iter().collect();
458+
assert_eq!(actual, values);
459+
460+
let view0 = array.views().first().unwrap();
461+
let view3 = array.views().get(3).unwrap();
462+
let view6 = array.views().get(6).unwrap();
463+
464+
assert_eq!(view0, view3);
465+
assert_eq!(view0, view6);
466+
467+
assert_eq!(array.views().get(1), array.views().get(5));
468+
}
469+
360470
#[test]
361471
fn test_string_view() {
362472
let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");

0 commit comments

Comments
 (0)