@@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType;
30
30
use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
31
31
use crate :: { ArrayRef , GenericByteViewArray } ;
32
32
33
- const DEFAULT_BLOCK_SIZE : u32 = 8 * 1024 ;
33
+ const STARTING_BLOCK_SIZE : u32 = 8 * 1024 ; // 8KiB
34
+ const MAX_BLOCK_SIZE : u32 = 2 * 1024 * 1024 ; // 2MiB
35
+
36
+ enum BlockSizeGrowthStrategy {
37
+ Fixed { size : u32 } ,
38
+ Exponential { current_size : u32 } ,
39
+ }
40
+
41
+ impl BlockSizeGrowthStrategy {
42
+ fn next_size ( & mut self ) -> u32 {
43
+ match self {
44
+ Self :: Fixed { size } => * size,
45
+ Self :: Exponential { current_size } => {
46
+ if * current_size < MAX_BLOCK_SIZE {
47
+ // we have fixed start/end block sizes, so we can't overflow
48
+ * current_size = current_size. saturating_mul ( 2 ) ;
49
+ * current_size
50
+ } else {
51
+ MAX_BLOCK_SIZE
52
+ }
53
+ }
54
+ }
55
+ }
56
+ }
34
57
35
58
/// A builder for [`GenericByteViewArray`]
36
59
///
@@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
58
81
null_buffer_builder : NullBufferBuilder ,
59
82
completed : Vec < Buffer > ,
60
83
in_progress : Vec < u8 > ,
61
- block_size : u32 ,
84
+ block_size : BlockSizeGrowthStrategy ,
62
85
/// Some if deduplicating strings
63
86
/// map `<string hash> -> <index to the views>`
64
87
string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
@@ -78,15 +101,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
78
101
null_buffer_builder : NullBufferBuilder :: new ( capacity) ,
79
102
completed : vec ! [ ] ,
80
103
in_progress : vec ! [ ] ,
81
- block_size : DEFAULT_BLOCK_SIZE ,
104
+ block_size : BlockSizeGrowthStrategy :: Exponential {
105
+ current_size : STARTING_BLOCK_SIZE ,
106
+ } ,
82
107
string_tracker : None ,
83
108
phantom : Default :: default ( ) ,
84
109
}
85
110
}
86
111
112
+ /// Set a fixed buffer size for variable length strings
113
+ ///
114
+ /// The block size is the size of the buffer used to store values greater
115
+ /// than 12 bytes. The builder allocates new buffers when the current
116
+ /// buffer is full.
117
+ ///
118
+ /// By default the builder balances buffer size and buffer count by
119
+ /// growing buffer size exponentially from 8KB up to 2MB. The
120
+ /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB.
121
+ ///
122
+ /// If this method is used, any new buffers allocated are
123
+ /// exactly this size. This can be useful for advanced users
124
+ /// that want to control the memory usage and buffer count.
125
+ ///
126
+ /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
127
+ pub fn with_fixed_block_size ( self , block_size : u32 ) -> Self {
128
+ debug_assert ! ( block_size > 0 , "Block size must be greater than 0" ) ;
129
+ Self {
130
+ block_size : BlockSizeGrowthStrategy :: Fixed { size : block_size } ,
131
+ ..self
132
+ }
133
+ }
134
+
87
135
/// Override the size of buffers to allocate for holding string data
136
+ /// Use `with_fixed_block_size` instead.
137
+ #[ deprecated( note = "Use `with_fixed_block_size` instead" ) ]
88
138
pub fn with_block_size ( self , block_size : u32 ) -> Self {
89
- Self { block_size , .. self }
139
+ self . with_fixed_block_size ( block_size )
90
140
}
91
141
92
142
/// Deduplicate strings while building the array
@@ -277,7 +327,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
277
327
let required_cap = self . in_progress . len ( ) + v. len ( ) ;
278
328
if self . in_progress . capacity ( ) < required_cap {
279
329
self . flush_in_progress ( ) ;
280
- let to_reserve = v. len ( ) . max ( self . block_size as usize ) ;
330
+ let to_reserve = v. len ( ) . max ( self . block_size . next_size ( ) as usize ) ;
281
331
self . in_progress . reserve ( to_reserve) ;
282
332
} ;
283
333
let offset = self . in_progress . len ( ) as u32 ;
@@ -478,7 +528,7 @@ mod tests {
478
528
479
529
let mut builder = StringViewBuilder :: new ( )
480
530
. with_deduplicate_strings ( )
481
- . with_block_size ( value_1. len ( ) as u32 * 2 ) ; // so that we will have multiple buffers
531
+ . with_fixed_block_size ( value_1. len ( ) as u32 * 2 ) ; // so that we will have multiple buffers
482
532
483
533
let values = vec ! [
484
534
Some ( value_1) ,
@@ -585,4 +635,46 @@ mod tests {
585
635
"Invalid argument error: No block found with index 5"
586
636
) ;
587
637
}
638
+
639
+ #[ test]
640
+ fn test_string_view_with_block_size_growth ( ) {
641
+ let mut exp_builder = StringViewBuilder :: new ( ) ;
642
+ let mut fixed_builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( STARTING_BLOCK_SIZE ) ;
643
+
644
+ let long_string = String :: from_utf8 ( vec ! [ b'a' ; STARTING_BLOCK_SIZE as usize ] ) . unwrap ( ) ;
645
+
646
+ for i in 0 ..9 {
647
+ // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
648
+ for _ in 0 ..( 2_u32 . pow ( i) ) {
649
+ exp_builder. append_value ( & long_string) ;
650
+ fixed_builder. append_value ( & long_string) ;
651
+ }
652
+ exp_builder. flush_in_progress ( ) ;
653
+ fixed_builder. flush_in_progress ( ) ;
654
+
655
+ // Every step only add one buffer, but the buffer size is much larger
656
+ assert_eq ! ( exp_builder. completed. len( ) , i as usize + 1 ) ;
657
+ assert_eq ! (
658
+ exp_builder. completed[ i as usize ] . len( ) ,
659
+ STARTING_BLOCK_SIZE as usize * 2_usize . pow( i)
660
+ ) ;
661
+
662
+ // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1
663
+ assert_eq ! ( fixed_builder. completed. len( ) , 2_usize . pow( i + 1 ) - 1 ) ;
664
+
665
+ // Every buffer is fixed size
666
+ assert ! ( fixed_builder
667
+ . completed
668
+ . iter( )
669
+ . all( |b| b. len( ) == STARTING_BLOCK_SIZE as usize ) ) ;
670
+ }
671
+
672
+ // Add one more value, and the buffer stop growing.
673
+ exp_builder. append_value ( & long_string) ;
674
+ exp_builder. flush_in_progress ( ) ;
675
+ assert_eq ! (
676
+ exp_builder. completed. last( ) . unwrap( ) . capacity( ) ,
677
+ MAX_BLOCK_SIZE as usize
678
+ ) ;
679
+ }
588
680
}
0 commit comments