diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 587255cc6b6a..7726ee35240f 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -420,23 +420,49 @@ pub type StringViewBuilder = GenericByteViewBuilder; /// [`GenericByteViewBuilder::append_null`] as normal. pub type BinaryViewBuilder = GenericByteViewBuilder; +/// Creates a view from a fixed length input (the compiler can generate +/// specialized code for this) +fn make_inlined_view(data: &[u8]) -> u128 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); + view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); + u128::from_le_bytes(view_buffer) +} + /// Create a view based on the given data, block id and offset -#[inline(always)] +/// Note that the code below is carefully examined with x86_64 assembly code: +/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), +/// which slows down things. +#[inline(never)] pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { - let len = data.len() as u32; - if len <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); - view_buffer[4..4 + data.len()].copy_from_slice(data); - u128::from_le_bytes(view_buffer) - } else { - let view = ByteView { - length: len, - prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), - buffer_index: block_id, - offset, - }; - view.into() + let len = data.len(); + + // Generate specialized code for each potential small string length + // to improve performance + match len { + 0 => make_inlined_view::<0>(data), + 1 => make_inlined_view::<1>(data), + 2 => make_inlined_view::<2>(data), + 3 => make_inlined_view::<3>(data), + 4 => make_inlined_view::<4>(data), + 5 => make_inlined_view::<5>(data), + 6 => make_inlined_view::<6>(data), + 7 => make_inlined_view::<7>(data), + 8 => make_inlined_view::<8>(data), + 9 => make_inlined_view::<9>(data), + 10 => make_inlined_view::<10>(data), + 11 => make_inlined_view::<11>(data), + 12 => make_inlined_view::<12>(data), + // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. + _ => { + let view = ByteView { + length: len as u32, + prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), + buffer_index: block_id, + offset, + }; + view.as_u128() + } } } diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 927998ac2489..814e75c249bf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -263,9 +263,10 @@ where InMemoryPageIterator::new(pages) } -fn build_plain_encoded_byte_array_page_iterator( +fn build_plain_encoded_byte_array_page_iterator_inner( column_desc: ColumnDescPtr, null_density: f32, + short_string: bool, ) -> impl PageIterator + Clone { let max_def_level = column_desc.max_def_level(); let max_rep_level = column_desc.max_rep_level(); @@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator( max_def_level }; if def_level == max_def_level { - let string_value = format!("Test value {k}, row group: {i}, page: {j}"); + let string_value = if short_string { + format!("{k}{i}{j}") + } else { + format!("Test value {k}, row group: {i}, page: {j}") + }; values.push(parquet::data_type::ByteArray::from(string_value.as_str())); } def_levels.push(def_level); @@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator( InMemoryPageIterator::new(pages) } +fn build_plain_encoded_byte_array_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, +) -> impl PageIterator + Clone { + build_plain_encoded_byte_array_page_iterator_inner(column_desc, null_density, false) +} + fn build_dictionary_encoded_string_page_iterator( column_desc: ColumnDescPtr, null_density: f32, @@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/BinaryViewArray"); + // binary view, plain encoded, no NULLs, short string + let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator_inner( + mandatory_binary_column_desc.clone(), + 0.0, + true, + ); + + // Short strings should not be slower than long strings, however, as discussed in https://github.com/apache/arrow-rs/issues/6034, + // the current implementation is more than 2x slower. + // This benchmark tracks the performance of short strings so that we can optimize it. + group.bench_function("plain encoded, mandatory, no NULLs, short string", |b| { + b.iter(|| { + let array_reader = create_byte_view_array_reader( + plain_byte_array_no_null_data.clone(), + mandatory_binary_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary view, plain encoded, no NULLs let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator(mandatory_binary_column_desc.clone(), 0.0);