Skip to content

Commit 9e91ef4

Browse files
etseidl and alamb authored
Improve performance of reading int8/int16 Parquet data (#7055)
* allow for reading improperly encoded UINT_8 and UINT_16 parquet data * add some benchmarks * remove print * checkpoint some experimental code * checkpoint * add a few more types * modify comment * another edit --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 188a141 commit 9e91ef4

File tree

1 file changed

+42
-2
lines changed

1 file changed

+42
-2
lines changed

parquet/src/arrow/array_reader/primitive_array.rs

Lines changed: 42 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,9 @@ use arrow_array::{
2929
TimestampNanosecondBufferBuilder, TimestampSecondBufferBuilder,
3030
},
3131
ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Float32Array, Float64Array,
32-
Int32Array, Int64Array, TimestampMicrosecondArray, TimestampMillisecondArray,
33-
TimestampNanosecondArray, TimestampSecondArray, UInt32Array, UInt64Array,
32+
Int16Array, Int32Array, Int64Array, Int8Array, TimestampMicrosecondArray,
33+
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
34+
UInt32Array, UInt64Array, UInt8Array,
3435
};
3536
use arrow_buffer::{i256, BooleanBuffer, Buffer};
3637
use arrow_data::ArrayDataBuilder;
@@ -261,6 +262,45 @@ where
261262
// - date64: cast int32 to date32, then date32 to date64.
262263
// - decimal: cast int32 to decimal, int64 to decimal
263264
let array = match target_type {
265+
// Using `arrow_cast::cast` has been found to be very slow for converting
266+
// INT32 physical type to lower bitwidth logical types. Since rust casts
267+
// are infallible, instead use `unary` which is much faster (by up to 40%).
268+
// One consequence of this approach is that some malformed integer columns
269+
// will return (an arguably correct) result rather than null.
270+
// See https://github.com/apache/arrow-rs/issues/7040 for a discussion of this
271+
// issue.
272+
ArrowType::UInt8 if *(array.data_type()) == ArrowType::Int32 => {
273+
let array = array
274+
.as_any()
275+
.downcast_ref::<Int32Array>()
276+
.unwrap()
277+
.unary(|i| i as u8) as UInt8Array;
278+
Arc::new(array) as ArrayRef
279+
}
280+
ArrowType::Int8 if *(array.data_type()) == ArrowType::Int32 => {
281+
let array = array
282+
.as_any()
283+
.downcast_ref::<Int32Array>()
284+
.unwrap()
285+
.unary(|i| i as i8) as Int8Array;
286+
Arc::new(array) as ArrayRef
287+
}
288+
ArrowType::UInt16 if *(array.data_type()) == ArrowType::Int32 => {
289+
let array = array
290+
.as_any()
291+
.downcast_ref::<Int32Array>()
292+
.unwrap()
293+
.unary(|i| i as u16) as UInt16Array;
294+
Arc::new(array) as ArrayRef
295+
}
296+
ArrowType::Int16 if *(array.data_type()) == ArrowType::Int32 => {
297+
let array = array
298+
.as_any()
299+
.downcast_ref::<Int32Array>()
300+
.unwrap()
301+
.unary(|i| i as i16) as Int16Array;
302+
Arc::new(array) as ArrayRef
303+
}
264304
ArrowType::Date64 if *(array.data_type()) == ArrowType::Int32 => {
265305
// this is cheap as it internally reinterprets the data
266306
let a = arrow_cast::cast(&array, &ArrowType::Date32)?;

0 commit comments

Comments
 (0)