Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ criterion = { workspace = true }
rand = { workspace = true }
tokio = { workspace = true, features = ["macros", "rt", "sync"] }

[[bench]]
harness = false
name = "ascii"
required-features = ["string_expressions"]

[[bench]]
harness = false
name = "concat"
Expand Down
116 changes: 116 additions & 0 deletions datafusion/functions/benches/ascii.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;
mod helper;

use arrow::datatypes::{DataType, Field};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::ScalarFunctionArgs;
use helper::gen_string_array;

fn criterion_benchmark(c: &mut Criterion) {
let ascii = datafusion_functions::string::ascii();

// All benches are single batch run with 8192 rows
const N_ROWS: usize = 8192;
const STR_LEN: usize = 16;
const NULL_DENSITY: f32 = 0.2;
Copy link
Contributor

@Dandandan Dandandan May 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense to add a variant of the tests with NULL_DENSITY of 0.0.
This way we can optimize for the case with only non-null values.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense to add a variant of the tests with NULL_DENSITY of 0.0. This way we can optimize for the case with only non-null values.

I just added benchmark for this case. Seems that there is not much change in performance.

const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0;
const NORMAL_UTF8_DENSITY: f32 = 0.8;

// StringArray ASCII only
let args_string_ascii = gen_string_array(
N_ROWS,
STR_LEN,
NULL_DENSITY,
UTF8_DENSITY_OF_ALL_ASCII,
false,
);
c.bench_function("ascii/string_ascii_only", |b| {
b.iter(|| {
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
args: args_string_ascii.clone(),
arg_fields: vec![&Field::new(
"a",
args_string_ascii[0].data_type(),
true,
)],
number_rows: N_ROWS,
return_field: &Field::new("f", DataType::Utf8, true),
}))
})
});

// StringArray UTF8
let args_string_utf8 =
gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, false);
c.bench_function("ascii/string_utf8", |b| {
b.iter(|| {
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
args: args_string_utf8.clone(),
arg_fields: vec![&Field::new("a", args_string_utf8[0].data_type(), true)],
number_rows: N_ROWS,
return_field: &Field::new("f", DataType::Utf8, true),
}))
})
});

// StringViewArray ASCII only
let args_string_view_ascii = gen_string_array(
N_ROWS,
STR_LEN,
NULL_DENSITY,
UTF8_DENSITY_OF_ALL_ASCII,
true,
);
c.bench_function("ascii/string_view_ascii_only", |b| {
b.iter(|| {
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
args: args_string_view_ascii.clone(),
arg_fields: vec![&Field::new(
"a",
args_string_view_ascii[0].data_type(),
true,
)],
number_rows: N_ROWS,
return_field: &Field::new("f", DataType::Utf8, true),
}))
})
});

// StringViewArray UTF8
let args_string_view_utf8 =
gen_string_array(N_ROWS, STR_LEN, NULL_DENSITY, NORMAL_UTF8_DENSITY, true);
c.bench_function("ascii/string_view_utf8", |b| {
b.iter(|| {
black_box(ascii.invoke_with_args(ScalarFunctionArgs {
args: args_string_view_utf8.clone(),
arg_fields: vec![&Field::new(
"a",
args_string_view_utf8[0].data_type(),
true,
)],
number_rows: N_ROWS,
return_field: &Field::new("f", DataType::Utf8, true),
}))
})
});
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
26 changes: 15 additions & 11 deletions datafusion/functions/src/string/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.

use crate::utils::make_scalar_function;
use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
use arrow::array::{ArrayRef, AsArray, Int32Array, StringArrayType};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;
use datafusion_common::types::logical_string;
Expand Down Expand Up @@ -103,19 +103,22 @@ impl ScalarUDFImpl for AsciiFunc {

fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
where
V: ArrayAccessor<Item = &'a str>,
V: StringArrayType<'a, Item = &'a str>,
{
let iter = ArrayIter::new(array);
let result = iter
.map(|string| {
string.map(|s| {
let mut chars = s.chars();
chars.next().map_or(0, |v| v as i32)
})
let values: Vec<_> = (0..array.len())
.map(|i| {
if array.is_null(i) {
0
Comment on lines +110 to +111
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be legal to omit this if (and the jump it implies)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be legal. I am not sure it would be faster, though we could easily check with a benchmark once this PR is merged

We could at least add a check on the outside of the loop

// If the array has no nulls, skip checking each value
if array.null_count() == 0 {
  let values: Vec<_> = (0..array.len())....
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to look it up

Offsets must be monotonically increasing, that is offsets[j+1] >= offsets[j] for 0 <= j < length, even for null slots. This property ensures the location for all values is valid and well defined.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize now the spec doesn't say anything about the offset pointing to valid utf8 or not.
But value(i) being not unsafe, I guess arrow-rs should provide that guarantee.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is my understanding as well -- it should be safe (in the rust sense) to look up an entry that is set to null

} else {
let s = array.value(i);
s.chars().next().map_or(0, |c| c as i32)
}
})
.collect::<Int32Array>();
.collect();

Ok(Arc::new(result) as ArrayRef)
let array = Int32Array::new(values.into(), array.nulls().cloned());

Ok(Arc::new(array))
}

/// Returns the numeric code of the first character of the argument.
Expand Down Expand Up @@ -182,6 +185,7 @@ mod tests {
test_ascii!(Some(String::from("x")), Ok(Some(120)));
test_ascii!(Some(String::from("a")), Ok(Some(97)));
test_ascii!(Some(String::from("")), Ok(Some(0)));
test_ascii!(Some(String::from("🚀")), Ok(Some(128640)));
test_ascii!(None, Ok(None));
Ok(())
}
Expand Down