Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions arrow-array/src/array/run_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ use crate::{

/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
///
/// This encoding is variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
/// This encoding is a variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
/// and is good for representing data containing same values repeated consecutively.
///
/// [`RunArray`] contains `run_ends` array and `values` array of same length.
/// The `run_ends` array stores the indexes at which the run ends. The `values` array
/// stores the value of each run. Below example illustrates how a logical array is represented in
/// stores the value of each run. The below example illustrates how a logical array is represented in
/// [`RunArray`]
///
///
Expand Down
2 changes: 1 addition & 1 deletion arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ pub enum DataType {
/// that contain many repeated values using less memory, but with
/// a higher CPU overhead for some operations.
///
/// This type mostly used to represent low cardinality string
/// This type is mostly used to represent low cardinality string
/// arrays or a limited set of primitive types as integers.
Dictionary(Box<DataType>, Box<DataType>),
/// Exact 32-bit width decimal value with precision and scale
Expand Down
27 changes: 27 additions & 0 deletions parquet/src/arrow/array_reader/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,28 @@ pub fn make_byte_array_reader(
pages, data_type, reader,
)))
}
// TODO eventually add a dedicated [`ArrayReader`] for REE
ArrowType::RunEndEncoded(_, ref val_field) => match val_field.data_type() {
ArrowType::Binary
| ArrowType::Utf8
| ArrowType::Decimal128(_, _)
| ArrowType::Decimal256(_, _) => {
let reader = GenericRecordReader::new(column_desc);
Ok(Box::new(ByteArrayReader::<i32>::new(
pages, data_type, reader,
)))
}
ArrowType::LargeUtf8 | ArrowType::LargeBinary => {
let reader = GenericRecordReader::new(column_desc);
Ok(Box::new(ByteArrayReader::<i64>::new(
pages, data_type, reader,
)))
}
_ => Err(general_err!(
"invalid RunEndEncoded value type for byte array reader - {}",
data_type
)),
},
_ => Err(general_err!(
"invalid data type for byte array reader - {}",
data_type
Expand Down Expand Up @@ -147,6 +169,11 @@ impl<I: OffsetSizeTrait> ArrayReader for ByteArrayReader<I> {
.with_precision_and_scale(p, s)?;
Arc::new(decimal)
}
// TODO eventually add a dedicated [`ArrayReader`] for REE
ArrowType::RunEndEncoded(_, ref val_field) => {
let array = buffer.into_array(null_buffer, val_field.data_type().clone());
arrow_cast::cast(&array, &self.data_type)?
}
_ => buffer.into_array(null_buffer, self.data_type.clone()),
};

Expand Down
59 changes: 59 additions & 0 deletions parquet/src/arrow/array_reader/fixed_len_byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,65 @@ pub fn make_fixed_len_byte_array_reader(
));
}
}
// TODO eventually add a dedicated [`ArrayReader`] for REE
ArrowType::RunEndEncoded(_, val_field) => match val_field.data_type() {
ArrowType::FixedSizeBinary(_) => {}
ArrowType::Decimal32(_, _) => {
if byte_length > 4 {
return Err(general_err!(
"decimal 32 type too large, must be less then 4 bytes, got {}",
byte_length
));
}
}
ArrowType::Decimal64(_, _) => {
if byte_length > 8 {
return Err(general_err!(
"decimal 64 type too large, must be less then 8 bytes, got {}",
byte_length
));
}
}
ArrowType::Decimal128(_, _) => {
if byte_length > 16 {
return Err(general_err!(
"decimal 128 type too large, must be less than 16 bytes, got {}",
byte_length
));
}
}
ArrowType::Decimal256(_, _) => {
if byte_length > 32 {
return Err(general_err!(
"decimal 256 type too large, must be less than 32 bytes, got {}",
byte_length
));
}
}
ArrowType::Interval(_) => {
if byte_length != 12 {
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#interval
return Err(general_err!(
"interval type must consist of 12 bytes got {}",
byte_length
));
}
}
ArrowType::Float16 => {
if byte_length != 2 {
return Err(general_err!(
"float 16 type must be 2 bytes, got {}",
byte_length
));
}
}
_ => {
return Err(general_err!(
"invalid RunEndEncoded value type for fixed length byte array reader - {}",
data_type
));
}
},
_ => {
return Err(general_err!(
"invalid data type for fixed length byte array reader - {}",
Expand Down
44 changes: 43 additions & 1 deletion parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use crate::util::bit_util::num_required_bits;
use crate::util::interner::{Interner, Storage};
use arrow_array::{
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
LargeBinaryArray, LargeStringArray, RunArray, StringArray, StringViewArray,
};
use arrow_schema::DataType;

Expand Down Expand Up @@ -61,6 +61,28 @@ macro_rules! downcast_dict_op {
};
}

macro_rules! downcast_ree_impl {
($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{
$op($array
.as_any()
.downcast_ref::<RunArray<arrow_array::types::$key>>()
.unwrap()
.downcast::<$val>()
.unwrap()$(, $arg)*)
}};
}

macro_rules! downcast_ree_op {
($run_end_field:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => {
match $run_end_field.data_type() {
DataType::Int16 => downcast_ree_impl!($array, Int16Type, $val, $op$(, $arg)*),
DataType::Int32 => downcast_ree_impl!($array, Int32Type, $val, $op$(, $arg)*),
DataType::Int64 => downcast_ree_impl!($array, Int64Type, $val, $op$(, $arg)*),
_ => unreachable!(),
}
};
}

macro_rules! downcast_op {
($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => {
match $data_type {
Expand Down Expand Up @@ -92,6 +114,26 @@ macro_rules! downcast_op {
}
d => unreachable!("cannot downcast {} dictionary value to byte array", d),
},
DataType::RunEndEncoded(run_end, value) => match value.data_type() {
DataType::Utf8 => downcast_ree_op!(run_end, StringArray, $array, $op$(, $arg)*),
DataType::LargeUtf8 => {
downcast_ree_op!(run_end, LargeStringArray, $array, $op$(, $arg)*)
}
DataType::Utf8View => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not for this PR, but I notice the dictionary arms above don't have string/binary views 🤔

downcast_ree_op!(run_end, StringViewArray, $array, $op$(, $arg)*)
}
DataType::Binary => downcast_ree_op!(run_end, BinaryArray, $array, $op$(, $arg)*),
DataType::BinaryView => {
downcast_ree_op!(run_end, BinaryViewArray, $array, $op$(, $arg)*)
}
DataType::LargeBinary => {
downcast_ree_op!(run_end, LargeBinaryArray, $array, $op$(, $arg)*)
}
DataType::FixedSizeBinary(_) => {
downcast_ree_op!(run_end, FixedSizeBinaryArray, $array, $op$(, $arg)*)
}
d => unreachable!("cannot downcast {} run end encoded value to byte array", d),
},
d => unreachable!("cannot downcast {} to byte array", d),
}
};
Expand Down
4 changes: 4 additions & 0 deletions parquet/src/arrow/arrow_writer/levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ impl LevelInfoBuilder {
_ => unreachable!(),
})
}
DataType::RunEndEncoded(_, v) if is_leaf(v.data_type()) => {
let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone());
Ok(Self::Primitive(levels))
}
d => Err(nyi_err!("Datatype {} is not yet supported", d)),
}
}
Expand Down
Loading
Loading