@@ -22,6 +22,8 @@ use std::sync::Arc;
22
22
use arrow_buffer:: { Buffer , BufferBuilder , NullBufferBuilder , ScalarBuffer } ;
23
23
use arrow_data:: ByteView ;
24
24
use arrow_schema:: ArrowError ;
25
+ use hashbrown:: hash_table:: Entry ;
26
+ use hashbrown:: HashTable ;
25
27
26
28
use crate :: builder:: ArrayBuilder ;
27
29
use crate :: types:: bytes:: ByteArrayNativeType ;
@@ -57,6 +59,9 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
57
59
completed : Vec < Buffer > ,
58
60
in_progress : Vec < u8 > ,
59
61
block_size : u32 ,
62
+ /// Some if deduplicating strings
63
+ /// map `<string hash> -> <index to the views>`
64
+ string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
60
65
phantom : PhantomData < T > ,
61
66
}
62
67
@@ -74,6 +79,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
74
79
completed : vec ! [ ] ,
75
80
in_progress : vec ! [ ] ,
76
81
block_size : DEFAULT_BLOCK_SIZE ,
82
+ string_tracker : None ,
77
83
phantom : Default :: default ( ) ,
78
84
}
79
85
}
@@ -83,6 +89,20 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
83
89
Self { block_size, ..self }
84
90
}
85
91
92
+ /// Deduplicate strings while building the array
93
+ ///
94
+ /// This will potentially decrease the memory usage if the array have repeated strings
95
+ /// It will also increase the time to build the array as it needs to hash the strings
96
+ pub fn with_deduplicate_strings ( self ) -> Self {
97
+ Self {
98
+ string_tracker : Some ( (
99
+ HashTable :: with_capacity ( self . views_builder . capacity ( ) ) ,
100
+ Default :: default ( ) ,
101
+ ) ) ,
102
+ ..self
103
+ }
104
+ }
105
+
86
106
/// Append a new data block returning the new block offset
87
107
///
88
108
/// Note: this will first flush any in-progress block
@@ -179,6 +199,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
179
199
self . completed . push ( block) ;
180
200
}
181
201
202
+ /// Returns the value at the given index
203
+ /// Useful if we want to know what value has been inserted to the builder
204
+ fn get_value ( & self , index : usize ) -> & [ u8 ] {
205
+ let view = self . views_builder . as_slice ( ) . get ( index) . unwrap ( ) ;
206
+ let len = * view as u32 ;
207
+ if len <= 12 {
208
+ // # Safety
209
+ // The view is valid from the builder
210
+ unsafe { GenericByteViewArray :: < T > :: inline_value ( view, len as usize ) }
211
+ } else {
212
+ let view = ByteView :: from ( * view) ;
213
+ if view. buffer_index < self . completed . len ( ) as u32 {
214
+ let block = & self . completed [ view. buffer_index as usize ] ;
215
+ & block[ view. offset as usize ..view. offset as usize + view. length as usize ]
216
+ } else {
217
+ & self . in_progress [ view. offset as usize ..view. offset as usize + view. length as usize ]
218
+ }
219
+ }
220
+ }
221
+
182
222
/// Appends a value into the builder
183
223
///
184
224
/// # Panics
@@ -199,6 +239,40 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
199
239
return ;
200
240
}
201
241
242
+ // Deduplication if:
243
+ // (1) deduplication is enabled.
244
+ // (2) len > 12
245
+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
246
+ let hash_val = hasher. hash_one ( v) ;
247
+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
248
+
249
+ let entry = ht. entry (
250
+ hash_val,
251
+ |idx| {
252
+ let stored_value = self . get_value ( * idx) ;
253
+ v == stored_value
254
+ } ,
255
+ hasher_fn,
256
+ ) ;
257
+ match entry {
258
+ Entry :: Occupied ( occupied) => {
259
+ // If the string already exists, we will directly use the view
260
+ let idx = occupied. get ( ) ;
261
+ self . views_builder
262
+ . append ( self . views_builder . as_slice ( ) [ * idx] ) ;
263
+ self . null_buffer_builder . append_non_null ( ) ;
264
+ self . string_tracker = Some ( ( ht, hasher) ) ;
265
+ return ;
266
+ }
267
+ Entry :: Vacant ( vacant) => {
268
+ // o.w. we insert the (string hash -> view index)
269
+ // the idx is current length of views_builder, as we are inserting a new view
270
+ vacant. insert ( self . views_builder . len ( ) ) ;
271
+ }
272
+ }
273
+ self . string_tracker = Some ( ( ht, hasher) ) ;
274
+ }
275
+
202
276
let required_cap = self . in_progress . len ( ) + v. len ( ) ;
203
277
if self . in_progress . capacity ( ) < required_cap {
204
278
self . flush_in_progress ( ) ;
@@ -357,6 +431,42 @@ mod tests {
357
431
use super :: * ;
358
432
use crate :: Array ;
359
433
434
+ #[ test]
435
+ fn test_string_view_deduplicate ( ) {
436
+ let value_1 = "long string to test string view" ;
437
+ let value_2 = "not so similar string but long" ;
438
+
439
+ let mut builder = StringViewBuilder :: new ( )
440
+ . with_deduplicate_strings ( )
441
+ . with_block_size ( value_1. len ( ) as u32 * 2 ) ; // so that we will have multiple buffers
442
+
443
+ let values = vec ! [
444
+ Some ( value_1) ,
445
+ Some ( value_2) ,
446
+ Some ( "short" ) ,
447
+ Some ( value_1) ,
448
+ None ,
449
+ Some ( value_2) ,
450
+ Some ( value_1) ,
451
+ ] ;
452
+ builder. extend ( values. clone ( ) ) ;
453
+
454
+ let array = builder. finish_cloned ( ) ;
455
+ array. to_data ( ) . validate_full ( ) . unwrap ( ) ;
456
+ assert_eq ! ( array. data_buffers( ) . len( ) , 1 ) ; // without duplication we would need 3 buffers.
457
+ let actual: Vec < _ > = array. iter ( ) . collect ( ) ;
458
+ assert_eq ! ( actual, values) ;
459
+
460
+ let view0 = array. views ( ) . first ( ) . unwrap ( ) ;
461
+ let view3 = array. views ( ) . get ( 3 ) . unwrap ( ) ;
462
+ let view6 = array. views ( ) . get ( 6 ) . unwrap ( ) ;
463
+
464
+ assert_eq ! ( view0, view3) ;
465
+ assert_eq ! ( view0, view6) ;
466
+
467
+ assert_eq ! ( array. views( ) . get( 1 ) , array. views( ) . get( 5 ) ) ;
468
+ }
469
+
360
470
#[ test]
361
471
fn test_string_view ( ) {
362
472
let b1 = Buffer :: from ( b"world\xFF bananas\xF0 \x9F \x98 \x81 " ) ;
0 commit comments