diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index 0f01324cb..0c885e65f 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -22,5 +22,6 @@ pub use schema::*; mod reader; pub(crate) mod record_batch_projector; pub(crate) mod record_batch_transformer; - +mod value; pub use reader::*; +pub use value::*; diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 4de9335d9..b9e36a3b4 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -221,7 +221,7 @@ pub fn arrow_type_to_type(ty: &DataType) -> Result { const ARROW_FIELD_DOC_KEY: &str = "doc"; -fn get_field_id(field: &Field) -> Result { +pub(super) fn get_field_id(field: &Field) -> Result { if let Some(value) = field.metadata().get(PARQUET_FIELD_ID_META_KEY) { return value.parse::().map_err(|e| { Error::new( diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs new file mode 100644 index 000000000..3df12e88c --- /dev/null +++ b/crates/iceberg/src/arrow/value.rs @@ -0,0 +1,880 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, NullArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, ListPartnerIterator, Literal, Map, MapPartnerIterator, + PartnerAccessor, PrimitiveType, SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; + +impl SchemaWithPartnerVisitor for ArrowArrayConverter { + type T = Vec>; + + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec>, + ) -> Result>> { + Ok(value) + } + + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec>, + ) -> Result>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The field is required but has null value", + )); + } + Ok(value) + } + + fn r#struct( + &mut self, + _struct: &StructType, + _partner: &ArrayRef, + results: Vec>>, + ) -> Result>> { + let row_len = results.first().map(|column| column.len()).unwrap_or(0); + if results.iter().any(|column| column.len() != row_len) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The struct columns have different row length", + )); + } + + let mut struct_literals = Vec::with_capacity(row_len); + let mut columns_iters = results + .into_iter() + .map(|column| column.into_iter()) + .collect::>(); + + for _ in 0..row_len { + let mut literals = Vec::with_capacity(columns_iters.len()); + for column_iter in columns_iters.iter_mut() { + literals.push(column_iter.next().unwrap()); + } + struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + } + + Ok(struct_literals) + } + + fn list( + &mut self, + list: &crate::spec::ListType, + _partner: &ArrayRef, + results: Vec>>, + ) -> Result>> { + if list.element_field.required { + if results.iter().any(|row| row.iter().any(Option::is_none)) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The list should not have null value", + )); + } + } + Ok(results + .into_iter() + .map(|row| Some(Literal::List(row))) + .collect()) + } + + fn map( + &mut self, + map: &crate::spec::MapType, + _partner: &ArrayRef, + key_values: Vec>>, + values: Vec>>, + ) -> Result>> { + // Make sure key_value and value have the same row length + if key_values.len() != values.len() { + return Err(Error::new( + ErrorKind::DataInvalid, + "The key value and value of map should have the same row length", + )); + } + + let mut result = Vec::with_capacity(key_values.len()); + for (key, value) in key_values.into_iter().zip(values.into_iter()) { + // Make sure key_value and value have the same length + if key.len() != value.len() { + return Err(Error::new( + ErrorKind::DataInvalid, + "The key value and value of map should have the same length", + )); + } + // Make sure no null value in key_value + if key.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The key value of map should not have null value", + )); + } + + // Make sure no null value in value if value field is required + if map.value_field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The value of map should not have null value", + )); + } + + let mut map = Map::new(); + for (k, v) in key.into_iter().zip(value.into_iter()) { + map.insert(k.unwrap(), v.clone()); + } + result.push(Some(Literal::Map(map))); + } + + Ok(result) + } + + fn primitive(&mut self, p: &PrimitiveType, partner: &ArrayRef) -> Result>> { + if let Some(_) = partner.as_any().downcast_ref::() { + return Ok(vec![None; partner.len()]); + } + match p { + PrimitiveType::Boolean => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a boolean array") + })?; + Ok(array.iter().map(|v| v.map(Literal::bool)).collect()) + } + PrimitiveType::Int => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a int32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::int)).collect()) + } + PrimitiveType::Long => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a int64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::long)).collect()) + } + PrimitiveType::Float => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a float32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::float)).collect()) + } + PrimitiveType::Double => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a float64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::double)).collect()) + } + PrimitiveType::Decimal { precision, scale } => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a decimal128 array", + ) + })?; + if let DataType::Decimal128(arrow_precision, arrow_scale) = array.data_type() { + if *arrow_precision as u32 != *precision || *arrow_scale as u32 != *scale { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "The precision or scale ({},{}) of arrow decimal128 array is not compatitable with iceberg decimal type ({},{})", + arrow_precision, arrow_scale, precision, scale + ), + )); + } + } + Ok(array.iter().map(|v| v.map(Literal::decimal)).collect()) + } + PrimitiveType::Date => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a date32 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::date)).collect()) + } + PrimitiveType::Time => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a time64 array") + })?; + Ok(array.iter().map(|v| v.map(Literal::time)).collect()) + } + PrimitiveType::Timestamp => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamp array", + ) + })?; + Ok(array.iter().map(|v| v.map(Literal::timestamp)).collect()) + } + PrimitiveType::Timestamptz => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamptz array", + ) + })?; + Ok(array.iter().map(|v| v.map(Literal::timestamptz)).collect()) + } + PrimitiveType::TimestampNs => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamp_ns array", + ) + })?; + Ok(array + .iter() + .map(|v| v.map(Literal::timestamp_nano)) + .collect()) + } + PrimitiveType::TimestamptzNs => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a timestamptz_ns array", + ) + })?; + Ok(array + .iter() + .map(|v| v.map(Literal::timestamptz_nano)) + .collect()) + } + PrimitiveType::String => { + if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array.iter().map(|v| v.map(Literal::string)).collect()) + } else if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array.iter().map(|v| v.map(Literal::string)).collect()) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a string array", + )); + } + } + PrimitiveType::Uuid => { + if let Some(array) = partner.as_any().downcast_ref::() { + if array.value_length() != 16 { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a uuid array", + )); + } + Ok(array + .iter() + .map(|v| { + v.map(|v| { + Ok(Literal::uuid(Uuid::from_bytes(v.try_into().map_err( + |_| { + Error::new( + ErrorKind::DataInvalid, + "Failed to convert binary to uuid", + ) + }, + )?))) + }) + .transpose() + }) + .collect::>>()?) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a uuid array", + )); + } + } + PrimitiveType::Fixed(len) => { + let array = partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a fixed array") + })?; + if array.value_length() != *len as i32 { + return Err(Error::new( + ErrorKind::DataInvalid, + "The length of fixed size binary array is not compatitable with iceberg fixed type", + )); + } + Ok(array + .iter() + .map(|v| v.map(|v| Literal::fixed(v.iter().cloned()))) + .collect()) + } + PrimitiveType::Binary => { + if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()) + } else if let Some(array) = partner.as_any().downcast_ref::() { + Ok(array + .iter() + .map(|v| v.map(|v| Literal::binary(v.to_vec()))) + .collect()) + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a binary array", + )); + } + } + } + } + + fn visit_type_before( + &mut self, + _ty: &crate::spec::Type, + partner: &ArrayRef, + ) -> Result>>> { + if let Some(_) = partner.as_any().downcast_ref::() { + return Ok(Some(vec![None; partner.len()])); + } + Ok(None) + } +} + +struct ArrowArrayAccessor; + +impl PartnerAccessor for ArrowArrayAccessor { + type L = ArrowArrayListIterator; + type M = ArrowArrayMapIterator; + + fn struct_parner<'a>(&self, schema_partner: &'a ArrayRef) -> Result<&'a ArrayRef> { + if !matches!(schema_partner.data_type(), DataType::Struct(_)) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The schema partner is not a struct type", + )); + } + Ok(schema_partner) + } + + fn field_partner<'a>( + &self, + struct_partner: &'a ArrayRef, + field_id: i32, + _field_name: &str, + ) -> Result<&'a ArrayRef> { + let struct_array = struct_partner + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The struct partner is not a struct array", + ) + })?; + let field_pos = struct_array + .fields() + .iter() + .position(|field| { + get_field_id(field) + .map(|id| id == field_id) + .unwrap_or(false) + }) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Field id {} not found in struct array", field_id), + ) + })?; + Ok(struct_array.column(field_pos)) + } + + fn list_element_partner<'a>( + &self, + list_partner: &'a ArrayRef, + ) -> Result { + if !matches!( + list_partner.data_type(), + DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) + ) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The list partner is not a list type", + )); + } + Ok(ArrowArrayListIterator { + array: list_partner.clone(), + index: 0, + }) + } + + fn map_element_partner<'a>(&self, map_partner: &'a ArrayRef) -> Result { + if !matches!(map_partner.data_type(), DataType::Map(_, _)) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The map partner is not a map type", + )); + } + Ok(ArrowArrayMapIterator { + array: map_partner.clone(), + index: 0, + }) + } +} + +struct ArrowArrayListIterator { + array: ArrayRef, + index: usize, +} + +impl ListPartnerIterator for ArrowArrayListIterator { + fn next(&mut self) -> Option { + if self.index >= self.array.len() { + return None; + } + if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else if let Some(array) = self.array.as_any().downcast_ref::() { + let result = Some(array.value(self.index)); + self.index += 1; + result + } else { + None + } + } +} + +struct ArrowArrayMapIterator { + array: ArrayRef, + index: usize, +} + +impl MapPartnerIterator for ArrowArrayMapIterator { + fn next(&mut self) -> Option<(ArrayRef, ArrayRef)> { + if let Some(array) = self.array.as_any().downcast_ref::() { + let entry = array.value(self.index); + Some((entry.column(0).clone(), entry.column(1).clone())) + } else { + None + } + } +} + +/// Convert arrow struct array to iceberg struct value array. +/// This function will assume the schema of arrow struct array is the same as iceberg struct type. +pub fn arrow_struct_to_literal( + struct_array: &ArrayRef, + ty: &StructType, +) -> Result>> { + visit_struct_with_partner( + ty, + struct_array, + &mut ArrowArrayConverter, + &ArrowArrayAccessor, + ) +} + +#[cfg(test)] +mod test { + use std::collections::HashMap; + use std::sync::Arc; + + use arrow_array::{ + ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, + }; + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + + use super::*; + use crate::spec::{Literal, NestedField, PrimitiveType, StructType, Type}; + + #[test] + fn test_arrow_struct_to_iceberg_struct() { + let bool_array = BooleanArray::from(vec![Some(true), Some(false), None]); + let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); + let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); + let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); + let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); + let float64_array = Float64Array::from(vec![Some(3.3), Some(4.4), None]); + let decimal_array = Decimal128Array::from(vec![Some(1000), Some(2000), None]) + .with_precision_and_scale(10, 2) + .unwrap(); + let date_array = Date32Array::from(vec![Some(18628), Some(18629), None]); + let time_array = Time64MicrosecondArray::from(vec![Some(123456789), Some(987654321), None]); + let timestamp_micro_array = TimestampMicrosecondArray::from(vec![ + Some(1622548800000000), + Some(1622635200000000), + None, + ]); + let timestamp_nano_array = TimestampNanosecondArray::from(vec![ + Some(1622548800000000000), + Some(1622635200000000000), + None, + ]); + let string_array = StringArray::from(vec![Some("a"), Some("b"), None]); + let binary_array = + BinaryArray::from(vec![Some(b"abc".as_ref()), Some(b"def".as_ref()), None]); + + let struct_array = Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("bool_field", DataType::Boolean, true)), + Arc::new(bool_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int16_field", DataType::Int16, true)), + Arc::new(int16_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int32_field", DataType::Int32, true)), + Arc::new(int32_array) as ArrayRef, + ), + ( + Arc::new(Field::new("int64_field", DataType::Int64, true)), + Arc::new(int64_array) as ArrayRef, + ), + ( + Arc::new(Field::new("float32_field", DataType::Float32, true)), + Arc::new(float32_array) as ArrayRef, + ), + ( + Arc::new(Field::new("float64_field", DataType::Float64, true)), + Arc::new(float64_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "decimal_field", + DataType::Decimal128(10, 2), + true, + )), + Arc::new(decimal_array) as ArrayRef, + ), + ( + Arc::new(Field::new("date_field", DataType::Date32, true)), + Arc::new(date_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "time_field", + DataType::Time64(TimeUnit::Microsecond), + true, + )), + Arc::new(time_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "timestamp_micro_field", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + )), + Arc::new(timestamp_micro_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "timestamp_nano_field", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )), + Arc::new(timestamp_nano_array) as ArrayRef, + ), + ( + Arc::new(Field::new("string_field", DataType::Utf8, true)), + Arc::new(string_array) as ArrayRef, + ), + ( + Arc::new(Field::new("binary_field", DataType::Binary, true)), + Arc::new(binary_array) as ArrayRef, + ), + ])) as ArrayRef; + + let iceberg_struct_type = StructType::new(vec![ + Arc::new(NestedField::optional( + 0, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 1, + "int16_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "int32_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 3, + "int64_field", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "float32_field", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 5, + "float64_field", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 6, + "decimal_field", + Type::Primitive(PrimitiveType::Decimal { + precision: 10, + scale: 2, + }), + )), + Arc::new(NestedField::optional( + 7, + "date_field", + Type::Primitive(PrimitiveType::Date), + )), + Arc::new(NestedField::optional( + 8, + "time_field", + Type::Primitive(PrimitiveType::Time), + )), + Arc::new(NestedField::optional( + 9, + "timestamp_micro_field", + Type::Primitive(PrimitiveType::Timestamp), + )), + Arc::new(NestedField::optional( + 10, + "timestamp_nao_field", + Type::Primitive(PrimitiveType::TimestampNs), + )), + Arc::new(NestedField::optional( + 11, + "string_field", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 12, + "binary_field", + Type::Primitive(PrimitiveType::Binary), + )), + ]); + + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); + + assert_eq!(result, vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::bool(true)), + Some(Literal::int(1)), + Some(Literal::int(3)), + Some(Literal::long(5)), + Some(Literal::float(1.1)), + Some(Literal::double(3.3)), + Some(Literal::decimal(1000)), + Some(Literal::date(18628)), + Some(Literal::time(123456789)), + Some(Literal::timestamp(1622548800000000)), + Some(Literal::timestamp_nano(1622548800000000000)), + Some(Literal::string("a".to_string())), + Some(Literal::binary(b"abc".to_vec())), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::bool(false)), + Some(Literal::int(2)), + Some(Literal::int(4)), + Some(Literal::long(6)), + Some(Literal::float(2.2)), + Some(Literal::double(4.4)), + Some(Literal::decimal(2000)), + Some(Literal::date(18629)), + Some(Literal::time(987654321)), + Some(Literal::timestamp(1622635200000000)), + Some(Literal::timestamp_nano(1622635200000000000)), + Some(Literal::string("b".to_string())), + Some(Literal::binary(b"def".to_vec())), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + None, None, None, None, None, None, None, None, None, None, None, None, None, + ]))), + ]); + } + + #[test] + fn test_single_column_nullable_struct() { + let struct_array = Arc::new(StructArray::new_null( + Fields::from(vec![Field::new("bool_field", DataType::Boolean, true)]), + 3, + )) as ArrayRef; + let iceberg_struct_type = StructType::new(vec![Arc::new(NestedField::optional( + 0, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + ))]); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); + assert_eq!(result, vec![None; 3]); + } + + #[test] + fn test_empty_struct() { + let struct_array = Arc::new(StructArray::new_null(Fields::empty(), 3)) as ArrayRef; + let iceberg_struct_type = StructType::new(vec![]); + let result = arrow_struct_to_literal(&struct_array, &iceberg_struct_type).unwrap(); + assert_eq!(result, vec![None; 0]); + } + + #[test] + fn test_arrow_struct_to_iceberg_struct_from_field_id() { + let bool_array = BooleanArray::from(vec![Some(true), Some(false), None]); + let int16_array = Int16Array::from(vec![Some(1), Some(2), None]); + let int32_array = Int32Array::from(vec![Some(3), Some(4), None]); + let int64_array = Int64Array::from(vec![Some(5), Some(6), None]); + let float32_array = Float32Array::from(vec![Some(1.1), Some(2.2), None]); + let struct_array = Arc::new(StructArray::from(vec![ + ( + Arc::new( + Field::new("bool_field", DataType::Boolean, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())], + )), + ), + Arc::new(bool_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int16_field", DataType::Int16, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string())], + )), + ), + Arc::new(int16_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int32_field", DataType::Int32, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())], + )), + ), + Arc::new(int32_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("int64_field", DataType::Int64, true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())], + )), + ), + Arc::new(int64_array) as ArrayRef, + ), + ( + Arc::new( + Field::new("float32_field", DataType::Float32, true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "5".to_string())]), + ), + ), + Arc::new(float32_array) as ArrayRef, + ), + ])) as ArrayRef; + let struct_type = StructType::new(vec![ + Arc::new(NestedField::optional( + 1, + "int16_field", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "bool_field", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 3, + "int64_field", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "int32_field", + Type::Primitive(PrimitiveType::Int), + )), + ]); + let result = arrow_struct_to_literal(&struct_array, &struct_type).unwrap(); + assert_eq!(result, vec![ + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(1)), + Some(Literal::bool(true)), + Some(Literal::long(5)), + Some(Literal::int(3)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + Some(Literal::int(2)), + Some(Literal::bool(false)), + Some(Literal::long(6)), + Some(Literal::int(4)), + ]))), + Some(Literal::Struct(Struct::from_iter(vec![ + None, None, None, None, + ]))), + ]); + } +} diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 649b6b2c4..688bbe65d 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -1132,6 +1132,213 @@ impl ReassignFieldIds { } } +/// A post order schema visitor with partner. +/// +/// For order of methods called, please refer to [`visit_schema_with_partner`]. +pub trait SchemaWithPartnerVisitor

{ + /// Return type of this visitor. + type T; + + /// Called before struct field. + fn before_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after struct field. + fn after_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before list field. + fn before_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after list field. + fn after_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map key field. + fn before_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map key field. + fn after_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map value field. + fn before_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map value field. + fn after_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + + /// Called before every type, if this function return `Some`, the following visiting will be skipped. + /// This function used to implement early return. + fn visit_type_before(&mut self, _ty: &Type, _partner: &P) -> Result> { + return Ok(None); + } + + /// Called after schema's type visited. + fn schema(&mut self, schema: &Schema, partner: &P, value: Self::T) -> Result; + /// Called after struct's field type visited. + fn field(&mut self, field: &NestedFieldRef, partner: &P, value: Self::T) -> Result; + /// Called after struct's fields visited. + fn r#struct( + &mut self, + r#struct: &StructType, + partner: &P, + results: Vec, + ) -> Result; + /// Called after list fields visited. + fn list(&mut self, list: &ListType, partner: &P, value: Vec) -> Result; + /// Called after map's key and value fields visited. + fn map( + &mut self, + map: &MapType, + partner: &P, + key_value: Vec, + value: Vec, + ) -> Result; + /// Called when see a primitive type. + fn primitive(&mut self, p: &PrimitiveType, partner: &P) -> Result; +} + +/// Accessor used to get child partner from parent partner. +pub trait PartnerAccessor

{ + /// List partner iterator. + type L: ListPartnerIterator

; + /// Map partner iterator. + type M: MapPartnerIterator

; + + /// Get the struct partner from schema partner. + fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>; + /// Get the field partner from struct partner. + fn field_partner<'a>(&self, struct_partner: &'a P, field_id: i32, field: &str) + -> Result<&'a P>; + /// Get the list element partner from list partner. + fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result; + /// Get the map key partner from map partner. + fn map_element_partner<'a>(&self, map_partner: &'a P) -> Result; +} + +/// Iterator for list partner. +pub trait ListPartnerIterator

{ + /// Get the next partner. + fn next(&mut self) -> Option

; +} + +/// Iterator for map partner. +pub trait MapPartnerIterator

{ + /// Get the next partner. + fn next(&mut self) -> Option<(P, P)>; +} + +/// Visiting a type in post order. +pub fn visit_type_with_partner, A: PartnerAccessor

>( + r#type: &Type, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + if let Some(res) = visitor.visit_type_before(r#type, partner)? { + return Ok(res); + } + match r#type { + Type::Primitive(p) => visitor.primitive(p, partner), + Type::List(list) => { + let mut results = Vec::new(); + let mut list_element_partner_iter = accessor.list_element_partner(partner)?; + if let Some(list_element_partner) = list_element_partner_iter.next() { + visitor.before_list_element(&list.element_field, &list_element_partner)?; + let value = visit_type_with_partner( + &list.element_field.field_type, + &list_element_partner, + visitor, + accessor, + )?; + visitor.after_list_element(&list.element_field, &list_element_partner)?; + results.push(value); + } + visitor.list(list, partner, results) + } + Type::Map(map) => { + let mut k_results = Vec::new(); + let mut v_results = Vec::new(); + let mut kv_partner_iter = accessor.map_element_partner(partner)?; + if let Some((k_partner, v_partner)) = kv_partner_iter.next() { + let key_result = { + visitor.before_map_key(&map.key_field, &k_partner)?; + let ret = visit_type_with_partner( + &map.key_field.field_type, + &k_partner, + visitor, + accessor, + )?; + visitor.after_map_key(&map.key_field, &k_partner)?; + ret + }; + + let value_result = { + visitor.before_map_value(&map.value_field, &v_partner)?; + let ret = visit_type_with_partner( + &map.value_field.field_type, + &v_partner, + visitor, + accessor, + )?; + visitor.after_map_value(&map.value_field, &v_partner)?; + ret + }; + + k_results.push(key_result); + v_results.push(value_result); + } + + visitor.map(map, partner, k_results, v_results) + } + Type::Struct(s) => visit_struct_with_partner(s, partner, visitor, accessor), + } +} + +/// Visit struct type in post order. +pub fn visit_struct_with_partner, A: PartnerAccessor

>( + s: &StructType, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + if let Some(res) = visitor.visit_type_before(&Type::Struct(s.clone()), partner)? { + return Ok(res); + } + let mut results = Vec::with_capacity(s.fields().len()); + for field in s.fields() { + let field_partner = accessor.field_partner(partner, field.id, &field.name)?; + visitor.before_struct_field(field, field_partner)?; + let result = visit_type_with_partner(&field.field_type, field_partner, visitor, accessor)?; + visitor.after_struct_field(field, field_partner)?; + let result = visitor.field(field, field_partner, result)?; + results.push(result); + } + + visitor.r#struct(s, partner, results) +} + +/// Visit schema in post order. +pub fn visit_schema_with_partner, A: PartnerAccessor

>( + schema: &Schema, + partner: &P, + visitor: &mut V, + accessor: &A, +) -> Result { + let result = visit_struct_with_partner( + &schema.r#struct, + accessor.struct_parner(partner)?, + visitor, + accessor, + )?; + visitor.schema(schema, partner, result) +} + pub(super) mod _serde { /// This is a helper module that defines types to help with serialization/deserialization. /// For deserialization the input first gets read into either the [SchemaV1] or [SchemaV2] struct diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs index 6fb070527..9b42152fc 100644 --- a/crates/iceberg/src/spec/values.rs +++ b/crates/iceberg/src/spec/values.rs @@ -1401,6 +1401,16 @@ impl Literal { Self::Primitive(PrimitiveLiteral::Long(value)) } + /// Creates a timestamp from unix epoch in nanoseconds. + pub fn timestamp_nano(value: i64) -> Self { + Self::Primitive(PrimitiveLiteral::Long(value)) + } + + /// Creates a timestamp with timezone from unix epoch in nanoseconds. + pub fn timestamptz_nano(value: i64) -> Self { + Self::Primitive(PrimitiveLiteral::Long(value)) + } + /// Creates a timestamp from [`DateTime`]. pub fn timestamp_from_datetime(dt: DateTime) -> Self { Self::timestamp(dt.with_timezone(&Utc).timestamp_micros())