Skip to content

Commit 78f8c5b

Browse files
committed
Support Utf8View for Avro
1 parent 07093a4 commit 78f8c5b

File tree

7 files changed

+897
-25
lines changed

7 files changed

+897
-25
lines changed

arrow-avro/Cargo.toml

+7
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,10 @@ crc = { version = "3.0", optional = true }
5353

5454
[dev-dependencies]
5555
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
56+
criterion = { version = "0.5", default-features = false }
57+
tempfile = "3.3"
58+
arrow = { workspace = true }
59+
60+
[[bench]]
61+
name = "avro_reader"
62+
harness = false

arrow-avro/benches/avro_reader.rs

+275
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Comprehensive benchmarks comparing StringArray vs StringViewArray performance
19+
//!
20+
//! This benchmark suite compares the performance characteristics of StringArray vs
21+
//! StringViewArray across three key dimensions:
22+
//! 1. Array creation performance
23+
//! 2. String value access operations
24+
//! 3. Reading a simplified, Avro-like test file (not a real Avro container) into each array type
25+
26+
use std::fs::File;
27+
use std::io::{BufReader, Read, Write};
28+
use std::sync::Arc;
29+
use std::time::Duration;
30+
31+
use arrow::array::RecordBatch;
32+
use arrow::datatypes::{DataType, Field, Schema};
33+
use arrow_array::{ArrayRef, Int32Array, StringArray, StringViewArray};
34+
use arrow_avro::ReadOptions;
35+
use arrow_schema::ArrowError;
36+
use criterion::*;
37+
use tempfile::NamedTempFile;
38+
39+
/// Builds `count` deterministic test strings.
///
/// Element `i` is the text `str_<i>` followed by `str_length` copies of `'a'`,
/// so every string is slightly longer than `str_length` and unique per index.
fn create_test_data(count: usize, str_length: usize) -> Vec<String> {
    let mut values = Vec::with_capacity(count);
    for index in 0..count {
        let mut value = format!("str_{}", index);
        value.push_str(&"a".repeat(str_length));
        values.push(value);
    }
    values
}
44+
45+
fn create_avro_test_file(row_count: usize, str_length: usize) -> Result<NamedTempFile, ArrowError> {
46+
let schema = Arc::new(Schema::new(vec![
47+
Field::new("string_field", DataType::Utf8, false),
48+
Field::new("int_field", DataType::Int32, false),
49+
]));
50+
51+
let strings = create_test_data(row_count, str_length);
52+
let string_array = StringArray::from_iter(strings.iter().map(|s| Some(s.as_str())));
53+
let int_array = Int32Array::from_iter_values(0..row_count as i32);
54+
let _batch = RecordBatch::try_new(
55+
schema.clone(),
56+
vec![
57+
Arc::new(string_array) as ArrayRef,
58+
Arc::new(int_array) as ArrayRef,
59+
],
60+
)?;
61+
62+
let temp_file = NamedTempFile::new()?;
63+
64+
let mut file = temp_file.reopen()?;
65+
66+
file.write_all(b"AVRO")?;
67+
68+
for (i, string) in strings.iter().enumerate().take(row_count) {
69+
let s = string.as_bytes();
70+
let len = s.len() as u32;
71+
file.write_all(&len.to_le_bytes())?;
72+
file.write_all(s)?;
73+
file.write_all(&(i as i32).to_le_bytes())?;
74+
}
75+
76+
file.flush()?;
77+
Ok(temp_file)
78+
}
79+
80+
fn read_avro_test_file(
81+
file_path: &std::path::Path,
82+
options: &ReadOptions,
83+
) -> Result<RecordBatch, ArrowError> {
84+
let file = File::open(file_path)?;
85+
let mut reader = BufReader::new(file);
86+
87+
let mut header = [0u8; 4];
88+
reader.read_exact(&mut header)?;
89+
90+
let mut strings = Vec::new();
91+
let mut ints = Vec::new();
92+
93+
loop {
94+
let mut len_bytes = [0u8; 4];
95+
if reader.read_exact(&mut len_bytes).is_err() {
96+
break; // End of file
97+
}
98+
99+
let len = u32::from_le_bytes(len_bytes) as usize;
100+
let mut buf = vec![0u8; len];
101+
reader.read_exact(&mut buf)?;
102+
103+
let s = String::from_utf8(buf)
104+
.map_err(|e| ArrowError::ParseError(format!("Invalid UTF-8: {}", e)))?;
105+
106+
strings.push(s);
107+
108+
let mut int_bytes = [0u8; 4];
109+
reader.read_exact(&mut int_bytes)?;
110+
ints.push(i32::from_le_bytes(int_bytes));
111+
}
112+
113+
let string_array: ArrayRef = if options.use_utf8view {
114+
Arc::new(StringViewArray::from_iter(
115+
strings.iter().map(|s| Some(s.as_str())),
116+
))
117+
} else {
118+
Arc::new(StringArray::from_iter(
119+
strings.iter().map(|s| Some(s.as_str())),
120+
))
121+
};
122+
123+
let int_array: ArrayRef = Arc::new(Int32Array::from(ints));
124+
125+
let schema = Arc::new(Schema::new(vec![
126+
if options.use_utf8view {
127+
Field::new("string_field", DataType::Utf8View, false)
128+
} else {
129+
Field::new("string_field", DataType::Utf8, false)
130+
},
131+
Field::new("int_field", DataType::Int32, false),
132+
]));
133+
134+
RecordBatch::try_new(schema, vec![string_array, int_array])
135+
}
136+
137+
fn bench_array_creation(c: &mut Criterion) {
138+
let mut group = c.benchmark_group("array_creation");
139+
group.sample_size(20);
140+
group.measurement_time(Duration::from_secs(5));
141+
142+
for &str_length in &[10, 100, 1000] {
143+
let data = create_test_data(10000, str_length);
144+
let row_count = 1000;
145+
146+
group.bench_function(format!("string_array_{}_chars", str_length), |b| {
147+
b.iter(|| {
148+
let string_array =
149+
StringArray::from_iter(data[0..row_count].iter().map(|s| Some(s.as_str())));
150+
let int_array = Int32Array::from_iter_values(0..row_count as i32);
151+
152+
let schema = Arc::new(Schema::new(vec![
153+
Field::new("string_field", DataType::Utf8, false),
154+
Field::new("int_field", DataType::Int32, false),
155+
]));
156+
157+
let batch = RecordBatch::try_new(
158+
schema,
159+
vec![
160+
Arc::new(string_array) as ArrayRef,
161+
Arc::new(int_array) as ArrayRef,
162+
],
163+
)
164+
.unwrap();
165+
166+
criterion::black_box(batch)
167+
})
168+
});
169+
170+
group.bench_function(format!("string_view_{}_chars", str_length), |b| {
171+
b.iter(|| {
172+
let string_array =
173+
StringViewArray::from_iter(data[0..row_count].iter().map(|s| Some(s.as_str())));
174+
let int_array = Int32Array::from_iter_values(0..row_count as i32);
175+
176+
let schema = Arc::new(Schema::new(vec![
177+
Field::new("string_field", DataType::Utf8View, false),
178+
Field::new("int_field", DataType::Int32, false),
179+
]));
180+
181+
let batch = RecordBatch::try_new(
182+
schema,
183+
vec![
184+
Arc::new(string_array) as ArrayRef,
185+
Arc::new(int_array) as ArrayRef,
186+
],
187+
)
188+
.unwrap();
189+
190+
criterion::black_box(batch)
191+
})
192+
});
193+
}
194+
195+
group.finish();
196+
}
197+
198+
fn bench_string_operations(c: &mut Criterion) {
199+
let mut group = c.benchmark_group("string_operations");
200+
group.sample_size(20);
201+
group.measurement_time(Duration::from_secs(5));
202+
203+
for &str_length in &[10, 100, 1000] {
204+
let data = create_test_data(10000, str_length);
205+
let rows = 1000;
206+
207+
let string_array = StringArray::from_iter(data[0..rows].iter().map(|s| Some(s.as_str())));
208+
let string_view_array =
209+
StringViewArray::from_iter(data[0..rows].iter().map(|s| Some(s.as_str())));
210+
211+
group.bench_function(format!("string_array_value_{}_chars", str_length), |b| {
212+
b.iter(|| {
213+
let mut sum_len = 0;
214+
for i in 0..rows {
215+
sum_len += string_array.value(i).len();
216+
}
217+
criterion::black_box(sum_len)
218+
})
219+
});
220+
221+
group.bench_function(format!("string_view_value_{}_chars", str_length), |b| {
222+
b.iter(|| {
223+
let mut sum_len = 0;
224+
for i in 0..rows {
225+
sum_len += string_view_array.value(i).len();
226+
}
227+
criterion::black_box(sum_len)
228+
})
229+
});
230+
}
231+
232+
group.finish();
233+
}
234+
235+
fn bench_avro_reader(c: &mut Criterion) {
236+
let mut group = c.benchmark_group("avro_reader");
237+
group.sample_size(20);
238+
group.measurement_time(Duration::from_secs(5));
239+
240+
for &str_length in &[10, 100, 1000] {
241+
let row_count = 1000;
242+
let temp_file = create_avro_test_file(row_count, str_length).unwrap();
243+
let file_path = temp_file.path();
244+
245+
group.bench_function(format!("string_array_{}_chars", str_length), |b| {
246+
b.iter(|| {
247+
let options = ReadOptions {
248+
use_utf8view: false,
249+
};
250+
251+
let batch = read_avro_test_file(file_path, &options).unwrap();
252+
criterion::black_box(batch)
253+
})
254+
});
255+
256+
group.bench_function(format!("string_view_{}_chars", str_length), |b| {
257+
b.iter(|| {
258+
let options = ReadOptions { use_utf8view: true };
259+
260+
let batch = read_avro_test_file(file_path, &options).unwrap();
261+
criterion::black_box(batch)
262+
})
263+
});
264+
}
265+
266+
group.finish();
267+
}
268+
269+
criterion_group!(
270+
benches,
271+
bench_array_creation,
272+
bench_string_operations,
273+
bench_avro_reader
274+
);
275+
criterion_main!(benches);

0 commit comments

Comments
 (0)