Skip to content

Commit bd1e76b

Browse files
XiangpengHao and alamb authored
Implement exponential block size growing strategy for StringViewBuilder (#6136)
* new block size growing strategy
* Update arrow-array/src/builder/generic_bytes_view_builder.rs

  Co-authored-by: Andrew Lamb <[email protected]>
* update function name, deprecate old function
* update comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 11f2bb8 commit bd1e76b

File tree

3 files changed

+104
-12
lines changed

3 files changed

+104
-12
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,7 @@ mod tests {
757757
fn test_in_progress_recreation() {
758758
let array = {
759759
// make a builder with small block size.
760-
let mut builder = StringViewBuilder::new().with_block_size(14);
760+
let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
761761
builder.append_value("large payload over 12 bytes");
762762
builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
763763
builder.finish()
@@ -848,7 +848,7 @@ mod tests {
848848
];
849849

850850
let array = {
851-
let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers
851+
let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
852852
test_data.into_iter().for_each(|v| builder.append_option(v));
853853
builder.finish()
854854
};

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 98 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType;
3030
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
3131
use crate::{ArrayRef, GenericByteViewArray};
3232

33-
const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024;
33+
const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB
34+
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB
35+
36+
enum BlockSizeGrowthStrategy {
37+
Fixed { size: u32 },
38+
Exponential { current_size: u32 },
39+
}
40+
41+
impl BlockSizeGrowthStrategy {
42+
fn next_size(&mut self) -> u32 {
43+
match self {
44+
Self::Fixed { size } => *size,
45+
Self::Exponential { current_size } => {
46+
if *current_size < MAX_BLOCK_SIZE {
47+
// we have fixed start/end block sizes, so we can't overflow
48+
*current_size = current_size.saturating_mul(2);
49+
*current_size
50+
} else {
51+
MAX_BLOCK_SIZE
52+
}
53+
}
54+
}
55+
}
56+
}
3457

3558
/// A builder for [`GenericByteViewArray`]
3659
///
@@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
5881
null_buffer_builder: NullBufferBuilder,
5982
completed: Vec<Buffer>,
6083
in_progress: Vec<u8>,
61-
block_size: u32,
84+
block_size: BlockSizeGrowthStrategy,
6285
/// Some if deduplicating strings
6386
/// map `<string hash> -> <index to the views>`
6487
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
@@ -78,15 +101,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
78101
null_buffer_builder: NullBufferBuilder::new(capacity),
79102
completed: vec![],
80103
in_progress: vec![],
81-
block_size: DEFAULT_BLOCK_SIZE,
104+
block_size: BlockSizeGrowthStrategy::Exponential {
105+
current_size: STARTING_BLOCK_SIZE,
106+
},
82107
string_tracker: None,
83108
phantom: Default::default(),
84109
}
85110
}
86111

112+
/// Set a fixed buffer size for variable length strings
113+
///
114+
/// The block size is the size of the buffer used to store values greater
115+
/// than 12 bytes. The builder allocates new buffers when the current
116+
/// buffer is full.
117+
///
118+
/// By default the builder balances buffer size and buffer count by
119+
/// doubling the requested buffer size on each new allocation, seeded by
120+
/// an 8KiB starting block size and capped at 2MiB.
121+
///
122+
/// If this method is used, any new buffers are requested with
123+
/// this size (or the length of the value being appended, if that is
124+
/// larger). This is useful to control the memory usage and buffer count.
125+
///
126+
/// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications.
///
/// In builds with debug assertions enabled, this panics if `block_size` is 0.
127+
pub fn with_fixed_block_size(self, block_size: u32) -> Self {
128+
debug_assert!(block_size > 0, "Block size must be greater than 0");
129+
Self {
130+
block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
131+
..self
132+
}
133+
}
134+
87135
/// Override the size of buffers to allocate for holding string data
136+
/// Use `with_fixed_block_size` instead.
137+
#[deprecated(note = "Use `with_fixed_block_size` instead")]
88138
pub fn with_block_size(self, block_size: u32) -> Self {
89-
Self { block_size, ..self }
139+
self.with_fixed_block_size(block_size)
90140
}
91141

92142
/// Deduplicate strings while building the array
@@ -277,7 +327,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
277327
let required_cap = self.in_progress.len() + v.len();
278328
if self.in_progress.capacity() < required_cap {
279329
self.flush_in_progress();
280-
let to_reserve = v.len().max(self.block_size as usize);
330+
let to_reserve = v.len().max(self.block_size.next_size() as usize);
281331
self.in_progress.reserve(to_reserve);
282332
};
283333
let offset = self.in_progress.len() as u32;
@@ -478,7 +528,7 @@ mod tests {
478528

479529
let mut builder = StringViewBuilder::new()
480530
.with_deduplicate_strings()
481-
.with_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers
531+
.with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers
482532

483533
let values = vec![
484534
Some(value_1),
@@ -585,4 +635,46 @@ mod tests {
585635
"Invalid argument error: No block found with index 5"
586636
);
587637
}
638+
639+
// Compares the default (exponentially growing) builder with a fixed-block-size
// builder while appending 2^i values of STARTING_BLOCK_SIZE bytes per step.
#[test]
640+
fn test_string_view_with_block_size_growth() {
641+
let mut exp_builder = StringViewBuilder::new();
642+
let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE);
643+
644+
let long_string = String::from_utf8(vec![b'a'; STARTING_BLOCK_SIZE as usize]).unwrap();
645+
646+
for i in 0..9 {
647+
// Bytes appended per step: 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M
648+
for _ in 0..(2_u32.pow(i)) {
649+
exp_builder.append_value(&long_string);
650+
fixed_builder.append_value(&long_string);
651+
}
652+
exp_builder.flush_in_progress();
653+
fixed_builder.flush_in_progress();
654+
655+
// Every step only adds one buffer, but that buffer holds twice as much data as the previous one
656+
assert_eq!(exp_builder.completed.len(), i as usize + 1);
657+
assert_eq!(
658+
exp_builder.completed[i as usize].len(),
659+
STARTING_BLOCK_SIZE as usize * 2_usize.pow(i)
660+
);
661+
662+
// This step added 2^i blocks, so the total number of blocks should be 2^(i+1) - 1
663+
assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1);
664+
665+
// Every buffer of the fixed builder holds exactly STARTING_BLOCK_SIZE bytes
666+
assert!(fixed_builder
667+
.completed
668+
.iter()
669+
.all(|b| b.len() == STARTING_BLOCK_SIZE as usize));
670+
}
671+
672+
// Add one more value; the buffer size stops growing at MAX_BLOCK_SIZE.
673+
exp_builder.append_value(&long_string);
674+
exp_builder.flush_in_progress();
675+
assert_eq!(
676+
exp_builder.completed.last().unwrap().capacity(),
677+
MAX_BLOCK_SIZE as usize
678+
);
679+
}
588680
}

arrow-cast/src/cast/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5321,7 +5321,7 @@ mod tests {
53215321
let typed_dict = string_dict_array.downcast_dict::<StringArray>().unwrap();
53225322

53235323
let string_view_array = {
5324-
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
5324+
let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
53255325
for v in typed_dict.into_iter() {
53265326
builder.append_option(v);
53275327
}
@@ -5338,7 +5338,7 @@ mod tests {
53385338
let typed_binary_dict = binary_dict_array.downcast_dict::<BinaryArray>().unwrap();
53395339

53405340
let binary_view_array = {
5341-
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
5341+
let mut builder = BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
53425342
for v in typed_binary_dict.into_iter() {
53435343
builder.append_option(v);
53445344
}
@@ -5381,7 +5381,7 @@ mod tests {
53815381
O: OffsetSizeTrait,
53825382
{
53835383
let view_array = {
5384-
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
5384+
let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
53855385
for s in VIEW_TEST_DATA.iter() {
53865386
builder.append_option(*s);
53875387
}
@@ -5410,7 +5410,7 @@ mod tests {
54105410
O: OffsetSizeTrait,
54115411
{
54125412
let view_array = {
5413-
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
5413+
let mut builder = BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
54145414
for s in VIEW_TEST_DATA.iter() {
54155415
builder.append_option(*s);
54165416
}

0 commit comments

Comments
 (0)