From 160283e3e0d6c77c237a39e49721f618ff313f85 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 09:51:36 -0400 Subject: [PATCH 1/5] add benchmark to track performance --- parquet/benches/arrow_reader.rs | 37 +++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 927998ac2489..814e75c249bf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -263,9 +263,10 @@ where InMemoryPageIterator::new(pages) } -fn build_plain_encoded_byte_array_page_iterator( +fn build_plain_encoded_byte_array_page_iterator_inner( column_desc: ColumnDescPtr, null_density: f32, + short_string: bool, ) -> impl PageIterator + Clone { let max_def_level = column_desc.max_def_level(); let max_rep_level = column_desc.max_rep_level(); @@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator( max_def_level }; if def_level == max_def_level { - let string_value = format!("Test value {k}, row group: {i}, page: {j}"); + let string_value = if short_string { + format!("{k}{i}{j}") + } else { + format!("Test value {k}, row group: {i}, page: {j}") + }; values.push(parquet::data_type::ByteArray::from(string_value.as_str())); } def_levels.push(def_level); @@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator( InMemoryPageIterator::new(pages) } +fn build_plain_encoded_byte_array_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, +) -> impl PageIterator + Clone { + build_plain_encoded_byte_array_page_iterator_inner(column_desc, null_density, false) +} + fn build_dictionary_encoded_string_page_iterator( column_desc: ColumnDescPtr, null_density: f32, @@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/BinaryViewArray"); + // binary view, plain encoded, no NULLs, short string + let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator_inner( + mandatory_binary_column_desc.clone(), + 0.0, + true, + ); + + // Short strings should not be slower than long strings, however, as discussed in https://github.com/apache/arrow-rs/issues/6034, + // the current implementation is more than 2x slower. + // This benchmark tracks the performance of short strings so that we can optimize it. + group.bench_function("plain encoded, mandatory, no NULLs, short string", |b| { + b.iter(|| { + let array_reader = create_byte_view_array_reader( + plain_byte_array_no_null_data.clone(), + mandatory_binary_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary view, plain encoded, no NULLs let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator(mandatory_binary_column_desc.clone(), 0.0); From f8c889dd7209c61f7c420a59660aa39e0f5682f2 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 11:44:03 -0400 Subject: [PATCH 2/5] fast byte view construction --- .../src/builder/generic_bytes_view_builder.rs | 55 ++++++++++++++----- parquet/benches/arrow_reader.rs | 2 +- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 587255cc6b6a..b89bb34c2448 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -420,23 +420,48 @@ pub type StringViewBuilder = GenericByteViewBuilder; /// [`GenericByteViewBuilder::append_null`] as normal. pub type BinaryViewBuilder = GenericByteViewBuilder; +/// Creates a view from a fixed length input (the compiler can generate +/// specialized code for this) +fn make_view_inner(data: &[u8]) -> u128 { + let mut view_buffer = [0; 16]; + view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); + view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); + u128::from_le_bytes(view_buffer) +} + /// Create a view based on the given data, block id and offset -#[inline(always)] +/// Note that the code below is carefully examined with x86_64 assembly code: https://godbolt.org/z/685YPsd5G +/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), +/// which slows down things. +#[inline(never)] pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { - let len = data.len() as u32; - if len <= 12 { - let mut view_buffer = [0; 16]; - view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); - view_buffer[4..4 + data.len()].copy_from_slice(data); - u128::from_le_bytes(view_buffer) - } else { - let view = ByteView { - length: len, - prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), - buffer_index: block_id, - offset, - }; - view.into() + let len = data.len(); + + // Generate specialized code for each potential small string length + // to improve performance + match len { + 0 => make_view_inner::<0>(data), + 1 => make_view_inner::<1>(data), + 2 => make_view_inner::<2>(data), + 3 => make_view_inner::<3>(data), + 4 => make_view_inner::<4>(data), + 5 => make_view_inner::<5>(data), + 6 => make_view_inner::<6>(data), + 7 => make_view_inner::<7>(data), + 8 => make_view_inner::<8>(data), + 9 => make_view_inner::<9>(data), + 10 => make_view_inner::<10>(data), + 11 => make_view_inner::<11>(data), + 12 => make_view_inner::<12>(data), + _ => { + let view = ByteView { + length: len as u32, + prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), + buffer_index: block_id, + offset, + }; + view.as_u128() + } } } diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 814e75c249bf..1b61fcd58546 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -287,7 +287,7 @@ fn build_plain_encoded_byte_array_page_iterator_inner( }; if def_level == max_def_level { let string_value = if short_string { - format!("{k}{i}{j}") + format!("test") } else { format!("Test value {k}, row group: {i}, page: {j}") }; From b8336413b5cc527dfe11460f9ba67ea0152a0e54 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 11:55:28 -0400 Subject: [PATCH 3/5] make doc happy --- arrow-array/src/builder/generic_bytes_view_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index b89bb34c2448..ccb79b5d1372 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -430,7 +430,7 @@ fn make_view_inner(data: &[u8]) -> u128 { } /// Create a view based on the given data, block id and offset -/// Note that the code below is carefully examined with x86_64 assembly code: https://godbolt.org/z/685YPsd5G +/// Note that the code below is carefully examined with x86_64 assembly code: /// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), /// which slows down things. #[inline(never)] From 10608e1f7cf913bb6f7a7f14e2b94479657296ff Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 12:00:04 -0400 Subject: [PATCH 4/5] fix clippy --- parquet/benches/arrow_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 1b61fcd58546..814e75c249bf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -287,7 +287,7 @@ fn build_plain_encoded_byte_array_page_iterator_inner( }; if def_level == max_def_level { let string_value = if short_string { - format!("test") + format!("{k}{i}{j}") } else { format!("Test value {k}, row group: {i}, page: {j}") }; From 13462596894ccdec51edd7ea8abbfae335117b25 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 12:36:12 -0400 Subject: [PATCH 5/5] update comments --- .../src/builder/generic_bytes_view_builder.rs | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index ccb79b5d1372..7726ee35240f 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -422,7 +422,7 @@ pub type BinaryViewBuilder = GenericByteViewBuilder; /// Creates a view from a fixed length input (the compiler can generate /// specialized code for this) -fn make_view_inner(data: &[u8]) -> u128 { +fn make_inlined_view(data: &[u8]) -> u128 { let mut view_buffer = [0; 16]; view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); @@ -440,19 +440,20 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { // Generate specialized code for each potential small string length // to improve performance match len { - 0 => make_view_inner::<0>(data), - 1 => make_view_inner::<1>(data), - 2 => make_view_inner::<2>(data), - 3 => make_view_inner::<3>(data), - 4 => make_view_inner::<4>(data), - 5 => make_view_inner::<5>(data), - 6 => make_view_inner::<6>(data), - 7 => make_view_inner::<7>(data), - 8 => make_view_inner::<8>(data), - 9 => make_view_inner::<9>(data), - 10 => make_view_inner::<10>(data), - 11 => make_view_inner::<11>(data), - 12 => make_view_inner::<12>(data), + 0 => make_inlined_view::<0>(data), + 1 => make_inlined_view::<1>(data), + 2 => make_inlined_view::<2>(data), + 3 => make_inlined_view::<3>(data), + 4 => make_inlined_view::<4>(data), + 5 => make_inlined_view::<5>(data), + 6 => make_inlined_view::<6>(data), + 7 => make_inlined_view::<7>(data), + 8 => make_inlined_view::<8>(data), + 9 => make_inlined_view::<9>(data), + 10 => make_inlined_view::<10>(data), + 11 => make_inlined_view::<11>(data), + 12 => make_inlined_view::<12>(data), + // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. _ => { let view = ByteView { length: len as u32,