-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Optimize performance of string::ascii function
#16087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| extern crate criterion; | ||
| mod helper; | ||
|
|
||
| use arrow::datatypes::{DataType, Field}; | ||
| use criterion::{black_box, criterion_group, criterion_main, Criterion}; | ||
| use datafusion_expr::ScalarFunctionArgs; | ||
| use helper::gen_string_array; | ||
|
|
||
| fn criterion_benchmark(c: &mut Criterion) { | ||
| let ascii = datafusion_functions::string::ascii(); | ||
|
|
||
| // All benches are single batch run with 8192 rows | ||
| const N_ROWS: usize = 8192; | ||
| const STR_LEN: usize = 16; | ||
| const NULL_DENSITY: f32 = 0.2; | ||
| const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0; | ||
| const NORMAL_UTF8_DENSITY: f32 = 0.8; | ||
|
|
||
| // StringArray ASCII only | ||
| let args_string_ascii = gen_string_array( | ||
| N_ROWS, | ||
| STR_LEN, | ||
| NULL_DENSITY, | ||
| UTF8_DENSITY_OF_ALL_ASCII, | ||
| false, | ||
| ); | ||
| c.bench_function("ascii/string_ascii_only", |b| { | ||
| b.iter(|| { | ||
| black_box(ascii.invoke_with_args(ScalarFunctionArgs { | ||
| args: args_string_ascii.clone(), | ||
| arg_fields: vec![&Field::new( | ||
| "a", | ||
| args_string_ascii[0].data_type(), | ||
| true, | ||
| )], | ||
| number_rows: N_ROWS, | ||
| return_field: &Field::new("f", DataType::Utf8, true), | ||
| })) | ||
| }) | ||
| }); | ||
|
|
||
| // StringArray UTF8 | ||
| let args_string_utf8 = | ||
| gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, false); | ||
| c.bench_function("ascii/string_utf8", |b| { | ||
| b.iter(|| { | ||
| black_box(ascii.invoke_with_args(ScalarFunctionArgs { | ||
| args: args_string_utf8.clone(), | ||
| arg_fields: vec![&Field::new("a", args_string_utf8[0].data_type(), true)], | ||
| number_rows: N_ROWS, | ||
| return_field: &Field::new("f", DataType::Utf8, true), | ||
| })) | ||
| }) | ||
| }); | ||
|
|
||
| // StringViewArray ASCII only | ||
| let args_string_view_ascii = gen_string_array( | ||
| N_ROWS, | ||
| STR_LEN, | ||
| NULL_DENSITY, | ||
| UTF8_DENSITY_OF_ALL_ASCII, | ||
| true, | ||
| ); | ||
| c.bench_function("ascii/string_view_ascii_only", |b| { | ||
| b.iter(|| { | ||
| black_box(ascii.invoke_with_args(ScalarFunctionArgs { | ||
| args: args_string_view_ascii.clone(), | ||
| arg_fields: vec![&Field::new( | ||
| "a", | ||
| args_string_view_ascii[0].data_type(), | ||
| true, | ||
| )], | ||
| number_rows: N_ROWS, | ||
| return_field: &Field::new("f", DataType::Utf8, true), | ||
| })) | ||
| }) | ||
| }); | ||
|
|
||
| // StringViewArray UTF8 | ||
| let args_string_view_utf8 = | ||
| gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, true); | ||
| c.bench_function("ascii/string_view_utf8", |b| { | ||
| b.iter(|| { | ||
| black_box(ascii.invoke_with_args(ScalarFunctionArgs { | ||
| args: args_string_view_utf8.clone(), | ||
| arg_fields: vec![&Field::new( | ||
| "a", | ||
| args_string_view_utf8[0].data_type(), | ||
| true, | ||
| )], | ||
| number_rows: N_ROWS, | ||
| return_field: &Field::new("f", DataType::Utf8, true), | ||
| })) | ||
| }) | ||
| }); | ||
| } | ||
|
|
||
| criterion_group!(benches, criterion_benchmark); | ||
| criterion_main!(benches); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,7 @@ | |
| // under the License. | ||
|
|
||
| use crate::utils::make_scalar_function; | ||
| use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array}; | ||
| use arrow::array::{ArrayRef, AsArray, Int32Array, StringArrayType}; | ||
| use arrow::datatypes::DataType; | ||
| use arrow::error::ArrowError; | ||
| use datafusion_common::types::logical_string; | ||
|
|
@@ -103,19 +103,22 @@ impl ScalarUDFImpl for AsciiFunc { | |
|
|
||
| fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError> | ||
| where | ||
| V: ArrayAccessor<Item = &'a str>, | ||
| V: StringArrayType<'a, Item = &'a str>, | ||
| { | ||
| let iter = ArrayIter::new(array); | ||
| let result = iter | ||
| .map(|string| { | ||
| string.map(|s| { | ||
| let mut chars = s.chars(); | ||
| chars.next().map_or(0, |v| v as i32) | ||
| }) | ||
| let values: Vec<_> = (0..array.len()) | ||
| .map(|i| { | ||
| if array.is_null(i) { | ||
| 0 | ||
|
Comment on lines
+110
to
+111
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be legal to omit this
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be legal. I am not sure it would be faster, though we could easily check with a benchmark once this PR is merged We could at least add a check on the outside of the loop // If the array has no nulls, skip checking each value
if array.null_count() == 0 {
let values: Vec<_> = (0..array.len())....
}
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to look it up
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realize now the spec doesn't say anything about the offset pointing to valid utf8 or not.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is my understanding as well -- it should be safe (in the rust sense) to look up an entry that is set to null |
||
| } else { | ||
| let s = array.value(i); | ||
| s.chars().next().map_or(0, |c| c as i32) | ||
| } | ||
| }) | ||
| .collect::<Int32Array>(); | ||
| .collect(); | ||
|
|
||
| Ok(Arc::new(result) as ArrayRef) | ||
| let array = Int32Array::new(values.into(), array.nulls().cloned()); | ||
|
|
||
| Ok(Arc::new(array)) | ||
| } | ||
|
|
||
| /// Returns the numeric code of the first character of the argument. | ||
|
|
@@ -182,6 +185,7 @@ mod tests { | |
| test_ascii!(Some(String::from("x")), Ok(Some(120))); | ||
| test_ascii!(Some(String::from("a")), Ok(Some(97))); | ||
| test_ascii!(Some(String::from("")), Ok(Some(0))); | ||
| test_ascii!(Some(String::from("🚀")), Ok(Some(128640))); | ||
| test_ascii!(None, Ok(None)); | ||
| Ok(()) | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it makes sense to add a variant of the tests with
NULL_DENSITYof 0.0.This way we can optimize for the case with only non-null values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just added benchmark for this case. Seems that there is not much change in performance.