@@ -263,9 +263,10 @@ where
263
263
InMemoryPageIterator :: new ( pages)
264
264
}
265
265
266
- fn build_plain_encoded_byte_array_page_iterator (
266
+ fn build_plain_encoded_byte_array_page_iterator_inner (
267
267
column_desc : ColumnDescPtr ,
268
268
null_density : f32 ,
269
+ short_string : bool ,
269
270
) -> impl PageIterator + Clone {
270
271
let max_def_level = column_desc. max_def_level ( ) ;
271
272
let max_rep_level = column_desc. max_rep_level ( ) ;
@@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator(
285
286
max_def_level
286
287
} ;
287
288
if def_level == max_def_level {
288
- let string_value = format ! ( "Test value {k}, row group: {i}, page: {j}" ) ;
289
+ let string_value = if short_string {
290
+ format ! ( "{k}{i}{j}" )
291
+ } else {
292
+ format ! ( "Test value {k}, row group: {i}, page: {j}" )
293
+ } ;
289
294
values. push ( parquet:: data_type:: ByteArray :: from ( string_value. as_str ( ) ) ) ;
290
295
}
291
296
def_levels. push ( def_level) ;
@@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator(
303
308
InMemoryPageIterator :: new ( pages)
304
309
}
305
310
311
// Thin wrapper that forwards `column_desc` and `null_density` unchanged to
// `build_plain_encoded_byte_array_page_iterator_inner` with `short_string` set to `false`,
// so generated values use the long "Test value {k}, row group: {i}, page: {j}" format.
// Keeps the original two-argument signature available to existing callers.
+ fn build_plain_encoded_byte_array_page_iterator (
312
+ column_desc : ColumnDescPtr ,
313
+ null_density : f32 ,
314
+ ) -> impl PageIterator + Clone {
315
+ build_plain_encoded_byte_array_page_iterator_inner ( column_desc, null_density, false )
316
+ }
317
+
306
318
fn build_dictionary_encoded_string_page_iterator (
307
319
column_desc : ColumnDescPtr ,
308
320
null_density : f32 ,
@@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) {
1066
1078
1067
1079
let mut group = c. benchmark_group ( "arrow_array_reader/BinaryViewArray" ) ;
1068
1080
1081
+ // binary view, plain encoded, no NULLs, short string
1082
+ let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator_inner (
1083
+ mandatory_binary_column_desc. clone ( ) ,
1084
+ 0.0 ,
1085
+ true ,
1086
+ ) ;
1087
+
1088
+ // Short strings should not be slower than long strings; however, as discussed in https://github.com/apache/arrow-rs/issues/6034,
1089
+ // the current implementation is more than 2x slower.
1090
+ // This benchmark tracks the performance of short strings so that we can optimize it.
1091
+ group. bench_function ( "plain encoded, mandatory, no NULLs, short string" , |b| {
1092
+ b. iter ( || {
1093
+ let array_reader = create_byte_view_array_reader (
1094
+ plain_byte_array_no_null_data. clone ( ) ,
1095
+ mandatory_binary_column_desc. clone ( ) ,
1096
+ ) ;
1097
+ count = bench_array_reader ( array_reader) ;
1098
+ } ) ;
1099
+ assert_eq ! ( count, EXPECTED_VALUE_COUNT ) ;
1100
+ } ) ;
1101
+
1069
1102
// binary view, plain encoded, no NULLs
1070
1103
let plain_byte_array_no_null_data =
1071
1104
build_plain_encoded_byte_array_page_iterator ( mandatory_binary_column_desc. clone ( ) , 0.0 ) ;
0 commit comments