Skip to content

Commit acde46b

Browse files
author
Tai Le Manh
committed
Optimize performance of string::ascii function
1 parent 3e30f77 commit acde46b

File tree

3 files changed

+149
-14
lines changed

3 files changed

+149
-14
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ criterion = { workspace = true }
9090
rand = { workspace = true }
9191
tokio = { workspace = true, features = ["macros", "rt", "sync"] }
9292

93+
[[bench]]
94+
harness = false
95+
name = "ascii"
96+
required-features = ["string_expressions"]
97+
9398
[[bench]]
9499
harness = false
95100
name = "concat"
datafusion/functions/benches/ascii.rs

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
mod helper;
20+
21+
use arrow::datatypes::{DataType, Field};
22+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
23+
use datafusion_expr::ScalarFunctionArgs;
24+
use helper::gen_string_array;
25+
26+
fn criterion_benchmark(c: &mut Criterion) {
27+
let ascii = datafusion_functions::string::ascii();
28+
29+
// All benches are single batch run with 8192 rows
30+
const N_ROWS: usize = 8192;
31+
const STR_LEN: usize = 16;
32+
const NULL_DENSITY: f32 = 0.1;
33+
const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0;
34+
const NORMAL_UTF8_DENSITY: f32 = 0.8;
35+
36+
// StringArray ASCII only
37+
let args_string_ascii = gen_string_array(
38+
N_ROWS,
39+
STR_LEN,
40+
NULL_DENSITY,
41+
UTF8_DENSITY_OF_ALL_ASCII,
42+
false,
43+
);
44+
c.bench_function("ascii/string_ascii_only", |b| {
45+
b.iter(|| {
46+
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
47+
args: args_string_ascii.clone(),
48+
arg_fields: vec![&Field::new(
49+
"a",
50+
args_string_ascii[0].data_type(),
51+
true,
52+
)],
53+
number_rows: N_ROWS,
54+
return_field: &Field::new("f", DataType::Utf8, true),
55+
}))
56+
})
57+
});
58+
59+
// StringArray UTF8
60+
let args_string_utf8 =
61+
gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, false);
62+
c.bench_function("ascii/string_utf8", |b| {
63+
b.iter(|| {
64+
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
65+
args: args_string_utf8.clone(),
66+
arg_fields: vec![&Field::new("a", args_string_utf8[0].data_type(), true)],
67+
number_rows: N_ROWS,
68+
return_field: &Field::new("f", DataType::Utf8, true),
69+
}))
70+
})
71+
});
72+
73+
// StringViewArray ASCII only
74+
let args_string_view_ascii = gen_string_array(
75+
N_ROWS,
76+
STR_LEN,
77+
NULL_DENSITY,
78+
UTF8_DENSITY_OF_ALL_ASCII,
79+
true,
80+
);
81+
c.bench_function("ascii/string_view_ascii_only", |b| {
82+
b.iter(|| {
83+
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
84+
args: args_string_view_ascii.clone(),
85+
arg_fields: vec![&Field::new(
86+
"a",
87+
args_string_view_ascii[0].data_type(),
88+
true,
89+
)],
90+
number_rows: N_ROWS,
91+
return_field: &Field::new("f", DataType::Utf8, true),
92+
}))
93+
})
94+
});
95+
96+
// StringViewArray UTF8
97+
let args_string_view_utf8 =
98+
gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, true);
99+
c.bench_function("ascii/string_view_utf8", |b| {
100+
b.iter(|| {
101+
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
102+
args: args_string_view_utf8.clone(),
103+
arg_fields: vec![&Field::new(
104+
"a",
105+
args_string_view_utf8[0].data_type(),
106+
true,
107+
)],
108+
number_rows: N_ROWS,
109+
return_field: &Field::new("f", DataType::Utf8, true),
110+
}))
111+
})
112+
});
113+
}
114+
115+
criterion_group!(benches, criterion_benchmark);
116+
criterion_main!(benches);

datafusion/functions/src/string/ascii.rs

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@
1616
// under the License.
1717

1818
use crate::utils::make_scalar_function;
19-
use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
20-
use arrow::datatypes::DataType;
19+
use arrow::array::{
20+
ArrayData, ArrayRef, ArrowPrimitiveType, AsArray, Int32Array, StringArrayType,
21+
};
22+
use arrow::datatypes::{DataType, Int32Type};
2123
use arrow::error::ArrowError;
24+
use arrow_buffer::Buffer;
2225
use datafusion_common::types::logical_string;
2326
use datafusion_common::{internal_err, Result};
2427
use datafusion_expr::{ColumnarValue, Documentation, TypeSignatureClass};
@@ -103,19 +106,29 @@ impl ScalarUDFImpl for AsciiFunc {
103106

104107
fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
105108
where
106-
V: ArrayAccessor<Item = &'a str>,
109+
V: StringArrayType<'a, Item = &'a str>,
107110
{
108-
let iter = ArrayIter::new(array);
109-
let result = iter
110-
.map(|string| {
111-
string.map(|s| {
112-
let mut chars = s.chars();
113-
chars.next().map_or(0, |v| v as i32)
114-
})
115-
})
116-
.collect::<Int32Array>();
117-
118-
Ok(Arc::new(result) as ArrayRef)
111+
let mut values = Vec::with_capacity(array.len());
112+
113+
for i in 0..array.len() {
114+
if array.is_null(i) {
115+
values.push(0);
116+
} else {
117+
let s = array.value(i);
118+
let code = s.chars().next().map_or(0, |c| c as i32);
119+
values.push(code);
120+
}
121+
}
122+
123+
let nulls = array.nulls().cloned();
124+
125+
let data = ArrayData::builder(Int32Type::DATA_TYPE)
126+
.len(array.len())
127+
.add_buffer(Buffer::from_slice_ref(&values))
128+
.null_bit_buffer(nulls.map(|n| n.buffer().clone()))
129+
.build()?;
130+
131+
Ok(Arc::new(Int32Array::from(data)) as ArrayRef)
119132
}
120133

121134
/// Returns the numeric code of the first character of the argument.
@@ -182,6 +195,7 @@ mod tests {
182195
test_ascii!(Some(String::from("x")), Ok(Some(120)));
183196
test_ascii!(Some(String::from("a")), Ok(Some(97)));
184197
test_ascii!(Some(String::from("")), Ok(Some(0)));
198+
test_ascii!(Some(String::from("🚀")), Ok(Some(128640)));
185199
test_ascii!(None, Ok(None));
186200
Ok(())
187201
}

0 commit comments

Comments
 (0)