Skip to content

Commit 658e58f

Browse files
authored
add benchmark to track performance (#6101)
1 parent 5de1d5e commit 658e58f

File tree

1 file changed

+35
-2
lines changed

1 file changed

+35
-2
lines changed

parquet/benches/arrow_reader.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,9 +263,10 @@ where
263263
InMemoryPageIterator::new(pages)
264264
}
265265

266-
fn build_plain_encoded_byte_array_page_iterator(
266+
fn build_plain_encoded_byte_array_page_iterator_inner(
267267
column_desc: ColumnDescPtr,
268268
null_density: f32,
269+
short_string: bool,
269270
) -> impl PageIterator + Clone {
270271
let max_def_level = column_desc.max_def_level();
271272
let max_rep_level = column_desc.max_rep_level();
@@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator(
285286
max_def_level
286287
};
287288
if def_level == max_def_level {
288-
let string_value = format!("Test value {k}, row group: {i}, page: {j}");
289+
let string_value = if short_string {
290+
format!("{k}{i}{j}")
291+
} else {
292+
format!("Test value {k}, row group: {i}, page: {j}")
293+
};
289294
values.push(parquet::data_type::ByteArray::from(string_value.as_str()));
290295
}
291296
def_levels.push(def_level);
@@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator(
303308
InMemoryPageIterator::new(pages)
304309
}
305310

311+
/// Convenience wrapper around `build_plain_encoded_byte_array_page_iterator_inner`
/// that keeps the original two-argument signature for existing callers.
///
/// Forwards `column_desc` and `null_density` unchanged and passes `false` for
/// `short_string`, so generated values use the long
/// "Test value {k}, row group: {i}, page: {j}" form rather than the short one.
fn build_plain_encoded_byte_array_page_iterator(
312+
column_desc: ColumnDescPtr,
313+
null_density: f32,
314+
) -> impl PageIterator + Clone {
315+
build_plain_encoded_byte_array_page_iterator_inner(column_desc, null_density, false)
316+
}
317+
306318
fn build_dictionary_encoded_string_page_iterator(
307319
column_desc: ColumnDescPtr,
308320
null_density: f32,
@@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) {
10661078

10671079
let mut group = c.benchmark_group("arrow_array_reader/BinaryViewArray");
10681080

1081+
// binary view, plain encoded, no NULLs, short string
1082+
let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator_inner(
1083+
mandatory_binary_column_desc.clone(),
1084+
0.0,
1085+
true,
1086+
);
1087+
1088+
// Short strings should not be slower than long strings; however, as discussed in https://github.com/apache/arrow-rs/issues/6034,
1089+
// the current implementation is more than 2x slower.
1090+
// This benchmark tracks the performance of short strings so that we can optimize it.
1091+
group.bench_function("plain encoded, mandatory, no NULLs, short string", |b| {
1092+
b.iter(|| {
1093+
let array_reader = create_byte_view_array_reader(
1094+
plain_byte_array_no_null_data.clone(),
1095+
mandatory_binary_column_desc.clone(),
1096+
);
1097+
count = bench_array_reader(array_reader);
1098+
});
1099+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1100+
});
1101+
10691102
// binary view, plain encoded, no NULLs
10701103
let plain_byte_array_no_null_data =
10711104
build_plain_encoded_byte_array_page_iterator(mandatory_binary_column_desc.clone(), 0.0);

0 commit comments

Comments
 (0)