From 5ef683a0212710363f21afd902a9fbb9b234add0 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 30 Jul 2024 17:19:06 +0800 Subject: [PATCH 1/4] feat: Add file io (#23) --- crates/paimon/Cargo.toml | 3 +- crates/paimon/src/error.rs | 16 ++- crates/paimon/src/io/file_io.rs | 186 ++++++++++++++++++++++++++++++++ crates/paimon/src/io/mod.rs | 19 ++++ crates/paimon/src/lib.rs | 4 + 5 files changed, 224 insertions(+), 4 deletions(-) create mode 100644 crates/paimon/src/io/file_io.rs create mode 100644 crates/paimon/src/io/mod.rs diff --git a/crates/paimon/Cargo.toml b/crates/paimon/Cargo.toml index 1e33ce0..0af0b17 100644 --- a/crates/paimon/Cargo.toml +++ b/crates/paimon/Cargo.toml @@ -28,8 +28,9 @@ version.workspace = true [dependencies] bitflags = "2.6.0" -chrono = {version = "0.4.38", features = ["serde"]} +chrono = { version = "0.4.38", features = ["serde"] } serde = { version = "1", features = ["derive"] } serde_with = "3.8.3" snafu = "0.8.3" typed-builder = "^0.18" +opendal = "0.48" diff --git a/crates/paimon/src/error.rs b/crates/paimon/src/error.rs index f615649..6323f3f 100644 --- a/crates/paimon/src/error.rs +++ b/crates/paimon/src/error.rs @@ -15,16 +15,26 @@ // specific language governing permissions and limitations // under the License. -use snafu::Snafu; +use snafu::prelude::*; + +/// Result type used in paimon. +pub type Result = std::result::Result; /// Error type for paimon. -#[allow(dead_code)] #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("paimon data invalid for {}: {:?}", message, source))] + #[snafu(display("Paimon data invalid for {}: {:?}", message, source))] DataInvalid { message: String, #[snafu(backtrace)] source: snafu::Whatever, }, + #[snafu( + visibility(pub(crate)), + display("Paimon hitting unexpected error {}: {:?}", message, source) + )] + IoUnexpected { + message: String, + source: opendal::Error, + }, } diff --git a/crates/paimon/src/io/file_io.rs b/crates/paimon/src/io/file_io.rs new file mode 100644 index 0000000..5247b4f --- /dev/null +++ b/crates/paimon/src/io/file_io.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::*; +use std::collections::HashMap; + +use opendal::services::MemoryConfig; +use opendal::{Metakey, Operator}; +use snafu::ResultExt; + +#[derive(Clone, Debug)] +pub struct FileIO { + op: Operator, +} + +impl FileIO { + /// Create a new FileIO. + /// + /// The input HashMap is paimon-java's [`Options`](https://github.com/apache/paimon/blob/release-0.8.2/paimon-common/src/main/java/org/apache/paimon/options/Options.java#L60) + /// + /// TODO: Support building Operator from HashMap via options. + pub fn new(_: HashMap) -> Result { + let op = Operator::from_config(MemoryConfig::default()) + .context(IoUnexpectedSnafu { + message: "Failed to create operator".to_string(), + })? + .finish(); + Ok(Self { op }) + } + + /// Create a new input file to read data. + /// + /// Reference: + pub fn new_input(&self, path: &str) -> InputFile { + InputFile { + _op: self.op.clone(), + path: path.to_string(), + } + } + + /// Create a new output file to write data. + /// + /// Reference: + pub fn new_output(&self, path: &str) -> OutputFile { + OutputFile { + _op: self.op.clone(), + path: path.to_string(), + } + } + + /// Return a file status object that represents the path. + /// + /// Reference: + pub async fn get_status(&self, path: &str) -> Result { + let meta = self.op.stat(path).await.context(IoUnexpectedSnafu { + message: "Failed to get file status".to_string(), + })?; + + Ok(FileStatus { + size: meta.content_length(), + }) + } + + /// List the statuses of the files/directories in the given path if the path is a directory. + /// + /// References: + /// + /// FIXME: how to handle large dir? Better to return a stream instead? + pub async fn list_status(&self, path: &str) -> Result> { + let entries = self + .op + .list_with(path) + .metakey(Metakey::ContentLength) + .await + .context(IoUnexpectedSnafu { + message: "Failed to list file status".to_string(), + })?; + + Ok(entries + .into_iter() + .map(|meta| FileStatus { + size: meta.metadata().content_length(), + }) + .collect()) + } + + /// Check if exists. + /// + /// References: + pub async fn exists(&self, path: &str) -> Result { + self.op.is_exist(path).await.context(IoUnexpectedSnafu { + message: "Failed to check file existence".to_string(), + }) + } + + /// Delete a file. + /// + /// Reference: + pub async fn delete_file(&self, path: &str) -> Result<()> { + self.op.delete(path).await.context(IoUnexpectedSnafu { + message: "Failed to delete file".to_string(), + })?; + + Ok(()) + } + + /// Delete a dir recursively. + /// + /// Reference: + pub async fn delete_dir(&self, path: &str) -> Result<()> { + self.op.remove_all(path).await.context(IoUnexpectedSnafu { + message: "Failed to delete dir".to_string(), + })?; + Ok(()) + } + + /// Make the given file and all non-existent parents into directories. + /// + /// Has the semantics of Unix 'mkdir -p'. Existence of the directory hierarchy is not an error. + /// + /// Reference: + pub async fn mkdirs(&self, path: &str) -> Result<()> { + self.op.create_dir(path).await.context(IoUnexpectedSnafu { + message: "Failed to create dir".to_string(), + })?; + Ok(()) + } + + /// Renames the file/directory src to dst. + /// + /// Reference: + pub async fn rename(&self, src: &str, dst: &str) -> Result<()> { + self.op.rename(src, dst).await.context(IoUnexpectedSnafu { + message: "Failed to rename file".to_string(), + })?; + Ok(()) + } +} + +/// FileStatus represents the status of a file. +#[derive(Clone, Debug)] +pub struct FileStatus { + pub size: u64, +} + +/// Input file represents a file that can be read from. +#[derive(Clone, Debug)] +pub struct InputFile { + _op: Operator, + path: String, +} + +impl InputFile { + /// Get the path of given input file. + pub fn path(&self) -> &str { + &self.path + } +} + +/// Output file represents a file that can be written to. +#[derive(Clone, Debug)] +pub struct OutputFile { + _op: Operator, + path: String, +} + +impl OutputFile { + /// Get the path of given output file. + pub fn path(&self) -> &str { + &self.path + } +} diff --git a/crates/paimon/src/io/mod.rs b/crates/paimon/src/io/mod.rs new file mode 100644 index 0000000..a9d049b --- /dev/null +++ b/crates/paimon/src/io/mod.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod file_io; +pub use file_io::*; diff --git a/crates/paimon/src/lib.rs b/crates/paimon/src/lib.rs index bf367f0..6e15e0b 100644 --- a/crates/paimon/src/lib.rs +++ b/crates/paimon/src/lib.rs @@ -16,4 +16,8 @@ // under the License. mod error; +pub use error::Error; +pub use error::Result; + +pub mod io; pub mod spec; From f8c156f6358ee3844b901aad86b3a2f5e9fdd70e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 30 Jul 2024 17:52:45 +0800 Subject: [PATCH 2/4] chore: Fix all reference links to tag instead (#24) --- crates/paimon/src/spec/data_file.rs | 10 +++++----- crates/paimon/src/spec/schema.rs | 4 ++-- crates/paimon/src/spec/snapshot.rs | 2 +- crates/paimon/src/spec/types.rs | 22 +++++++++++----------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/crates/paimon/src/spec/data_file.rs b/crates/paimon/src/spec/data_file.rs index 4f6c41f..df92516 100644 --- a/crates/paimon/src/spec/data_file.rs +++ b/crates/paimon/src/spec/data_file.rs @@ -26,7 +26,7 @@ use std::fmt::{Display, Formatter}; /// column. Compared to the SQL standard, an optional field description simplifies the handling with /// complex structures. /// -/// Impl Reference: +/// Impl Reference: /// /// TODO: make RowType extends DataType. /// TODO: move me to a better place. @@ -44,7 +44,7 @@ pub const EMPTY_BINARY_ROW: BinaryRow = BinaryRow::new(0); /// An implementation of InternalRow. /// -/// Impl Reference: +/// Impl Reference: #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct BinaryRow { @@ -71,13 +71,13 @@ impl BinaryRow { /// TODO: implement me. /// The statistics for columns, supports the following stats. /// -/// Impl References: +/// Impl References: type SimpleStats = (); /// The Source of a file. /// TODO: move me to the manifest module. /// -/// Impl References: +/// Impl References: #[repr(u8)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] @@ -88,7 +88,7 @@ pub enum FileSource { /// Metadata of a data file. /// -/// Impl References: +/// Impl References: #[derive(Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct DataFileMeta { diff --git a/crates/paimon/src/spec/schema.rs b/crates/paimon/src/spec/schema.rs index 7a9b0d0..feeeac1 100644 --- a/crates/paimon/src/spec/schema.rs +++ b/crates/paimon/src/spec/schema.rs @@ -22,7 +22,7 @@ use std::collections::HashMap; /// The table schema for paimon table. /// -/// Impl References: +/// Impl References: #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct TableSchema { @@ -40,7 +40,7 @@ pub struct TableSchema { /// Data field for paimon table. /// -/// Impl Reference: +/// Impl Reference: #[serde_as] #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct DataField { diff --git a/crates/paimon/src/spec/snapshot.rs b/crates/paimon/src/spec/snapshot.rs index 53b9583..c232431 100644 --- a/crates/paimon/src/spec/snapshot.rs +++ b/crates/paimon/src/spec/snapshot.rs @@ -20,7 +20,7 @@ use typed_builder::TypedBuilder; /// Snapshot for paimon. /// -/// Impl Reference: . +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, TypedBuilder)] #[serde(rename_all = "camelCase")] pub struct Snapshot { diff --git a/crates/paimon/src/spec/types.rs b/crates/paimon/src/spec/types.rs index b44dc2d..231691b 100644 --- a/crates/paimon/src/spec/types.rs +++ b/crates/paimon/src/spec/types.rs @@ -45,7 +45,7 @@ bitflags! { /// The root of data type. /// -/// Impl Reference: +/// Impl Reference: #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, Hash)] pub enum DataTypeRoot { Char, @@ -143,7 +143,7 @@ pub trait DataTypeVisitor { /// Data type for paimon table. /// -/// Impl Reference: +/// Impl Reference: #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct DataType { is_nullable: bool, @@ -174,7 +174,7 @@ impl DataType { } /// Returns a deep copy of this type with possibly different nullability. - /// Impl Reference: + /// Impl Reference: fn with_nullable(&self, is_nullable: bool) -> Self { Self { is_nullable, @@ -184,41 +184,41 @@ impl DataType { /// Returns true if the data type is nullable. /// - /// Impl Reference: + /// Impl Reference: fn is_nullable(&self) -> bool { self.is_nullable } /// Returns the root of the data type. /// - /// Impl Reference: + /// Impl Reference: fn type_root(&self) -> &DataTypeRoot { &self.type_root } /// Returns whether the root of the type equals to the type_root or not. /// - /// Impl Reference: + /// Impl Reference: fn is(&self, type_root: &DataTypeRoot) -> bool { &self.type_root == type_root } /// Returns whether the family type of the type equals to the family or not. /// - /// Impl Reference: + /// Impl Reference: fn is_family(&self, family: DataTypeFamily) -> bool { self.type_root.families().contains(family) } /// Returns whether the root of the type equals to at least on of the type_roots or not. /// - /// Impl Reference: + /// Impl Reference: fn is_any_of(&self, type_roots: &[DataTypeRoot]) -> bool { type_roots.iter().any(|tr: &DataTypeRoot| self.is(tr)) } /// Returns whether the root of the type is part of at least one family of the families or not. - /// Impl Reference: + /// Impl Reference: fn is_any_of_family(&self, families: &[DataTypeFamily]) -> bool { families.iter().any(|f: &DataTypeFamily| self.is_family(*f)) } @@ -233,7 +233,7 @@ impl DataType { /// ArrayType for paimon. /// -/// Impl Reference: . +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] #[serde(rename_all = "camelCase")] pub struct ArrayType { @@ -389,7 +389,7 @@ impl BooleanType { /// CharType for paimon. /// -/// Impl Reference: . +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct CharType { From a5cfac79ae4e9eaf87b752b3720fb0575cef3cbb Mon Sep 17 00:00:00 2001 From: Asura7969 <1402357969@qq.com> Date: Fri, 2 Aug 2024 20:46:25 +0800 Subject: [PATCH 3/4] feat(spec): Add Manifest List (#17) --- crates/paimon/Cargo.toml | 1 + crates/paimon/src/spec/manifest_file_meta.rs | 182 +++++++++++++++++++ crates/paimon/src/spec/manifest_list.rs | 30 +++ crates/paimon/src/spec/mod.rs | 6 + 4 files changed, 219 insertions(+) create mode 100644 crates/paimon/src/spec/manifest_file_meta.rs create mode 100644 crates/paimon/src/spec/manifest_list.rs diff --git a/crates/paimon/Cargo.toml b/crates/paimon/Cargo.toml index 0af0b17..0f03c5b 100644 --- a/crates/paimon/Cargo.toml +++ b/crates/paimon/Cargo.toml @@ -31,6 +31,7 @@ bitflags = "2.6.0" chrono = { version = "0.4.38", features = ["serde"] } serde = { version = "1", features = ["derive"] } serde_with = "3.8.3" +serde_bytes = "0.11.15" snafu = "0.8.3" typed-builder = "^0.18" opendal = "0.48" diff --git a/crates/paimon/src/spec/manifest_file_meta.rs b/crates/paimon/src/spec/manifest_file_meta.rs new file mode 100644 index 0000000..4f3775d --- /dev/null +++ b/crates/paimon/src/spec/manifest_file_meta.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_bytes::Bytes; +use std::fmt::{Display, Formatter}; + +/// Metadata of a manifest file. +/// +/// Impl Reference: +#[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] +pub struct ManifestFileMeta { + /// manifest file name + #[serde(rename = "_FILE_NAME")] + file_name: String, + + /// manifest file size. + #[serde(rename = "_FILE_SIZE")] + file_size: i64, + + /// number added files in manifest. + #[serde(rename = "_NUM_ADDED_FILES")] + num_added_files: i64, + + /// number deleted files in manifest. + #[serde(rename = "_NUM_DELETED_FILES")] + num_deleted_files: i64, + + /// partition stats, the minimum and maximum values of partition fields in this manifest are beneficial for skipping certain manifest files during queries, it is a SimpleStats. + #[serde(rename = "_PARTITION_STATS")] + partition_stats: BinaryTableStats, + + /// schema id when writing this manifest file. + #[serde(rename = "_SCHEMA_ID")] + schema_id: i64, +} + +impl ManifestFileMeta { + /// Get the manifest file name + #[inline] + pub fn file_name(&self) -> &str { + self.file_name.as_str() + } + + /// Get the manifest file size. + #[inline] + pub fn file_size(&self) -> i64 { + self.file_size + } + + /// Get the number added files in manifest. + #[inline] + pub fn num_added_files(&self) -> i64 { + self.num_added_files + } + + /// Get the number deleted files in manifest. + #[inline] + pub fn num_deleted_files(&self) -> i64 { + self.num_deleted_files + } + + /// Get the partition stats + pub fn partition_stats(&self) -> &BinaryTableStats { + &self.partition_stats + } + + /// Get the schema id when writing this manifest file. + #[inline] + pub fn schema_id(&self) -> i64 { + self.schema_id + } +} + +impl Display for ManifestFileMeta { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{{{}, {}, {}, {}, {:?}, {}}}", + self.file_name, + self.file_size, + self.num_added_files, + self.num_deleted_files, + self.partition_stats, + self.schema_id + ) + } +} + +/// The statistics for columns, supports the following stats. +/// +/// All statistics are stored in the form of a Binary, which can significantly reduce its memory consumption, but the cost is that the column type needs to be known when getting. +/// +/// Impl Reference: +#[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] +pub struct BinaryTableStats { + /// the minimum values of the columns + #[serde(rename = "_MIN_VALUES", with = "serde_bytes")] + min_values: Vec, + + /// the maximum values of the columns + #[serde(rename = "_MAX_VALUES", with = "serde_bytes")] + max_values: Vec, + + /// the number of nulls of the columns + #[serde( + rename = "_NULL_COUNTS", + serialize_with = "serialize_null_counts", + deserialize_with = "deserialize_null_counts" + )] + null_counts: Vec, +} + +impl BinaryTableStats { + /// Get the minimum values of the columns + #[inline] + pub fn min_values(&self) -> &[u8] { + &self.min_values + } + + /// Get the maximum values of the columns + #[inline] + pub fn max_values(&self) -> &[u8] { + &self.max_values + } + + /// Get the number of nulls of the columns + #[inline] + pub fn null_counts(&self) -> &Vec { + &self.null_counts + } +} + +impl Display for BinaryTableStats { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +fn serialize_null_counts(value: &Vec, serializer: S) -> Result +where + S: Serializer, +{ + let mut bytes = Vec::new(); + for &num in value { + bytes.extend_from_slice(&num.to_le_bytes()); + } + + let bytes = Bytes::new(bytes.as_slice()); + serializer.serialize_bytes(bytes) +} + +fn deserialize_null_counts<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let bytes = Deserialize::deserialize(deserializer).map(Bytes::new)?; + + let size_of_i64 = std::mem::size_of::(); + let i64_count = bytes.len() / size_of_i64; + let mut i64s = Vec::with_capacity(i64_count); + for chunk in bytes.chunks_exact(size_of_i64) { + i64s.push(i64::from_le_bytes( + chunk.try_into().expect("Chunk must be 8 bytes long"), + )); + } + Ok(i64s) +} diff --git a/crates/paimon/src/spec/manifest_list.rs b/crates/paimon/src/spec/manifest_list.rs new file mode 100644 index 0000000..a37e8c0 --- /dev/null +++ b/crates/paimon/src/spec/manifest_list.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::manifest_file_meta::ManifestFileMeta; + +/// This file includes several [`ManifestFileMeta`], representing all data of the whole table at the corresponding snapshot. +pub struct ManifestList {} + +impl ManifestList { + /// Write several [`ManifestFileMeta`]s into a manifest list. + /// + /// NOTE: This method is atomic. + pub fn write(&mut self, _metas: Vec) -> &str { + todo!() + } +} diff --git a/crates/paimon/src/spec/mod.rs b/crates/paimon/src/spec/mod.rs index fc09dcd..a2d1fa3 100644 --- a/crates/paimon/src/spec/mod.rs +++ b/crates/paimon/src/spec/mod.rs @@ -28,5 +28,11 @@ pub use schema::*; mod snapshot; pub use snapshot::*; +mod manifest_file_meta; +pub use manifest_file_meta::*; + +mod manifest_list; +pub use manifest_list::*; + mod types; pub use types::*; From 65a8e7460dd45ccd428c24f557e7f2eda7c6e62f Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 2 Aug 2024 22:43:18 +0800 Subject: [PATCH 4/4] refactor: Refactor DateTypes to make it more easy to use (#29) --- crates/paimon/src/spec/data_file.rs | 22 +- crates/paimon/src/spec/schema.rs | 2 +- crates/paimon/src/spec/types.rs | 811 +++++++++++++++------------- 3 files changed, 430 insertions(+), 405 deletions(-) diff --git a/crates/paimon/src/spec/data_file.rs b/crates/paimon/src/spec/data_file.rs index df92516..37165e6 100644 --- a/crates/paimon/src/spec/data_file.rs +++ b/crates/paimon/src/spec/data_file.rs @@ -15,31 +15,11 @@ // specific language governing permissions and limitations // under the License. -use super::schema::DataField; +use crate::spec::RowType; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; -/// Data type of a sequence of fields. A field consists of a field name, field type, and an optional -/// description. The most specific type of a row of a table is a row type. In this case, each column -/// of the row corresponds to the field of the row type that has the same ordinal position as the -/// column. Compared to the SQL standard, an optional field description simplifies the handling with -/// complex structures. -/// -/// Impl Reference: -/// -/// TODO: make RowType extends DataType. -/// TODO: move me to a better place. -pub struct RowType { - _fields: Vec, -} - -impl RowType { - pub const fn new(list: Vec) -> Self { - Self { _fields: list } - } -} - pub const EMPTY_BINARY_ROW: BinaryRow = BinaryRow::new(0); /// An implementation of InternalRow. diff --git a/crates/paimon/src/spec/schema.rs b/crates/paimon/src/spec/schema.rs index feeeac1..60b3923 100644 --- a/crates/paimon/src/spec/schema.rs +++ b/crates/paimon/src/spec/schema.rs @@ -42,7 +42,7 @@ pub struct TableSchema { /// /// Impl Reference: #[serde_as] -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[derive(Debug, Clone, PartialEq, Hash, Eq, Deserialize, Serialize)] pub struct DataField { id: i32, name: String, diff --git a/crates/paimon/src/spec/types.rs b/crates/paimon/src/spec/types.rs index 231691b..617ee7d 100644 --- a/crates/paimon/src/spec/types.rs +++ b/crates/paimon/src/spec/types.rs @@ -16,6 +16,7 @@ // under the License. use crate::error::Error; +use crate::spec::DataField; use bitflags::bitflags; use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; @@ -24,7 +25,7 @@ use std::str::FromStr; bitflags! { /// An enumeration of Data type families for clustering {@link DataTypeRoot}s into categories. /// -/// Impl Reference: +/// Impl Reference: #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct DataTypeFamily: u32 { const PREDEFINED = 1 << 0; @@ -43,111 +44,58 @@ bitflags! { } } -/// The root of data type. -/// -/// Impl Reference: -#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, Hash)] -pub enum DataTypeRoot { - Char, - Varchar, - Boolean, - Binary, - Varbinary, - Decimal, - Tinyint, - Smallint, - Integer, - Bigint, - Float, - Double, - Date, - TimeWithoutTimeZone, - TimestampWithoutTimeZone, - TimestampWithLocalTimeZone, - Array, - Multiset, - Map, - Row, -} - -impl DataTypeRoot { - pub fn families(&self) -> DataTypeFamily { - match self { - Self::Char => DataTypeFamily::PREDEFINED | DataTypeFamily::CHARACTER_STRING, - Self::Varchar => DataTypeFamily::PREDEFINED | DataTypeFamily::CHARACTER_STRING, - Self::Boolean => DataTypeFamily::PREDEFINED, - Self::Binary => DataTypeFamily::PREDEFINED | DataTypeFamily::BINARY_STRING, - Self::Varbinary => DataTypeFamily::PREDEFINED | DataTypeFamily::BINARY_STRING, - Self::Decimal => { - DataTypeFamily::PREDEFINED | DataTypeFamily::NUMERIC | DataTypeFamily::EXACT_NUMERIC - } - Self::Tinyint => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::INTEGER_NUMERIC - | DataTypeFamily::EXACT_NUMERIC - } - Self::Smallint => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::INTEGER_NUMERIC - | DataTypeFamily::EXACT_NUMERIC - } - Self::Integer => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::INTEGER_NUMERIC - | DataTypeFamily::EXACT_NUMERIC - } - Self::Bigint => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::INTEGER_NUMERIC - | DataTypeFamily::EXACT_NUMERIC - } - Self::Float => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::APPROXIMATE_NUMERIC - } - Self::Double => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::NUMERIC - | DataTypeFamily::APPROXIMATE_NUMERIC - } - Self::Date => DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME, - Self::TimeWithoutTimeZone => { - DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME | DataTypeFamily::TIME - } - Self::TimestampWithoutTimeZone => { - DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME | DataTypeFamily::TIMESTAMP - } - Self::TimestampWithLocalTimeZone => { - DataTypeFamily::PREDEFINED - | DataTypeFamily::DATETIME - | DataTypeFamily::TIMESTAMP - | DataTypeFamily::EXTENSION - } - Self::Array => DataTypeFamily::CONSTRUCTED | DataTypeFamily::COLLECTION, - Self::Multiset => DataTypeFamily::CONSTRUCTED | DataTypeFamily::COLLECTION, - Self::Map => DataTypeFamily::CONSTRUCTED | DataTypeFamily::EXTENSION, - Self::Row => DataTypeFamily::CONSTRUCTED, - } - } -} - -/// A visitor that can visit different data types. -pub trait DataTypeVisitor { - fn visit(&mut self, data_type: &DataType) -> R; -} - /// Data type for paimon table. /// /// Impl Reference: -#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize, Hash)] -pub struct DataType { - is_nullable: bool, - type_root: DataTypeRoot, +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub enum DataType { + /// Data type of a boolean with a (possibly) three-valued logic of `TRUE`, `FALSE`, `UNKNOWN`. + Boolean(BooleanType), + + /// Data type of a 1-byte (2^8) signed integer with values from -128 to 127. + TinyInt(TinyIntType), + /// Data type of a 2-byte (2^16) signed integer with values from -32,768 to 32,767. + SmallInt(SmallIntType), + /// Data type of a 4-byte (2^32) signed integer with values from -2,147,483,648 to 2,147,483,647. + Int(IntType), + /// Data type of an 8-byte (2^64) signed integer with values from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807. + BigInt(BigIntType), + /// Data type of a decimal number with fixed precision and scale. + Decimal(DecimalType), + /// Data type of an 8-byte double precision floating point number. + Double(DoubleType), + /// Data type of a 4-byte single precision floating point number. + Float(FloatType), + + /// Data type of a fixed-length binary string (=a sequence of bytes). + Binary(BinaryType), + /// Data type of a variable-length binary string (=a sequence of bytes). + VarBinary(VarBinaryType), + /// Data type of a fixed-length character string. + Char(CharType), + /// Data type of a variable-length character string. + VarChar(VarCharType), + + /// Data type of a date consisting of `year-month-day` with values ranging from `0000-01-01` to `9999-12-31` + Date(DateType), + /// Data type of a timestamp WITH LOCAL time zone consisting of `year-month-day hour:minute:second[.fractional] zone`. + LocalZonedTimestamp(LocalZonedTimestampType), + /// Data type of a time WITHOUT time zone consisting of `hour:minute:second[.fractional]` with + /// up to nanosecond precision and values ranging from `00:00:00.000000000` to `23:59:59.999999999`. + Time(TimeType), + /// Data type of a timestamp WITHOUT time zone consisting of `year-month-day hour:minute:second[.fractional]` with up to nanosecond precision and values ranging from `0000-01-01 00:00:00.000000000` to `9999-12-31 23:59:59.999999999`. + Timestamp(TimestampType), + + /// Data type of an array of elements with same subtype. + Array(ArrayType), + /// Data type of an associative array that maps keys `NULL` to values (including `NULL`). + Map(MapType), + /// Data type of a multiset (=bag). Unlike a set, it allows for multiple instances for each of its + /// elements with a common subtype. + Multiset(MultisetType), + /// Data type of a sequence of fields. A field consists of a field name, field type, and an optional + /// description. + Row(RowType), } impl Display for DataType { @@ -164,118 +112,55 @@ impl FromStr for DataType { } } -#[allow(dead_code)] -impl DataType { - fn new(is_nullable: bool, type_root: DataTypeRoot) -> Self { - Self { - is_nullable, - type_root, - } - } - - /// Returns a deep copy of this type with possibly different nullability. - /// Impl Reference: - fn with_nullable(&self, is_nullable: bool) -> Self { - Self { - is_nullable, - type_root: self.type_root, - } - } - - /// Returns true if the data type is nullable. - /// - /// Impl Reference: - fn is_nullable(&self) -> bool { - self.is_nullable - } - - /// Returns the root of the data type. - /// - /// Impl Reference: - fn type_root(&self) -> &DataTypeRoot { - &self.type_root - } - - /// Returns whether the root of the type equals to the type_root or not. - /// - /// Impl Reference: - fn is(&self, type_root: &DataTypeRoot) -> bool { - &self.type_root == type_root - } - - /// Returns whether the family type of the type equals to the family or not. - /// - /// Impl Reference: - fn is_family(&self, family: DataTypeFamily) -> bool { - self.type_root.families().contains(family) - } - - /// Returns whether the root of the type equals to at least on of the type_roots or not. - /// - /// Impl Reference: - fn is_any_of(&self, type_roots: &[DataTypeRoot]) -> bool { - type_roots.iter().any(|tr: &DataTypeRoot| self.is(tr)) - } - - /// Returns whether the root of the type is part of at least one family of the families or not. - /// Impl Reference: - fn is_any_of_family(&self, families: &[DataTypeFamily]) -> bool { - families.iter().any(|f: &DataTypeFamily| self.is_family(*f)) - } - - fn accept(&self, visitor: &mut T) - where - T: DataTypeVisitor, - { - visitor.visit(self); - } -} - /// ArrayType for paimon. /// +/// Data type of an array of elements with same subtype. +/// /// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] #[serde(rename_all = "camelCase")] pub struct ArrayType { - element_type: DataType, + nullable: bool, + element_type: Box, } impl Display for ArrayType { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - if self.element_type.is_nullable() { - write!(f, "ARRAY") - } else { - write!(f, "ARRAY NOT NULL") - } + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() } } -impl Default for ArrayType { - fn default() -> Self { - Self::new(true) +impl ArrayType { + pub fn new(element_type: DataType) -> Self { + Self::with_nullable(true, element_type) } -} -impl ArrayType { - pub fn new(is_nullable: bool) -> Self { + pub fn with_nullable(nullable: bool, element_type: DataType) -> Self { Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Array), + nullable, + element_type: Box::new(element_type), } } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::CONSTRUCTED | DataTypeFamily::COLLECTION + } } /// BigIntType for paimon. /// -/// Impl Reference: . +/// Data type of an 8-byte (2^64) signed integer with values from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct BigIntType { - element_type: DataType, + nullable: bool, } impl Display for BigIntType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "BIGINT")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -284,32 +169,43 @@ impl Display for BigIntType { impl Default for BigIntType { fn default() -> Self { - Self::new(true) + Self::new() } } impl BigIntType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Bigint), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED + | DataTypeFamily::NUMERIC + | DataTypeFamily::INTEGER_NUMERIC + | DataTypeFamily::EXACT_NUMERIC } } /// BinaryType for paimon. /// -/// Impl Reference: . +/// Data type of a fixed-length binary string (=a sequence of bytes). +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] #[serde(rename_all = "camelCase")] pub struct BinaryType { - element_type: DataType, + nullable: bool, length: usize, } impl Display for BinaryType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "BINARY({})", self.length)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -318,7 +214,7 @@ impl Display for BinaryType { impl Default for BinaryType { fn default() -> Self { - Self::new(true, Self::DEFAULT_LENGTH) + Self::new(Self::DEFAULT_LENGTH).unwrap() } } @@ -329,44 +225,40 @@ impl BinaryType { pub const DEFAULT_LENGTH: usize = 1; - pub fn new(is_nullable: bool, length: usize) -> Self { - Self::try_new(is_nullable, length).unwrap() + pub fn new(length: usize) -> Result { + Self::with_nullable(true, length) } - pub fn try_new(is_nullable: bool, length: usize) -> Result { + pub fn with_nullable(nullable: bool, length: usize) -> Result { if length < Self::MIN_LENGTH { return Err("Binary string length must be at least 1."); } - Ok(Self { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::Binary, - }, - length, - }) - } - - pub fn with_length(length: usize) -> Self { - Self::new(true, length) + Ok(Self { nullable, length }) } pub fn length(&self) -> usize { self.length } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::BINARY_STRING + } } /// BooleanType for paimon. /// -/// Impl Reference: . +/// Data type of a boolean with a (possibly) three-valued logic of `TRUE`, `FALSE`, `UNKNOWN`. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct BooleanType { - pub element_type: DataType, + nullable: bool, } impl Display for BooleanType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "BOOLEAN")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -375,32 +267,40 @@ impl Display for BooleanType { impl Default for BooleanType { fn default() -> Self { - Self::new(true) + Self::new() } } impl BooleanType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Boolean), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED } } /// CharType for paimon. /// +/// Data type of a fixed-length character string. +/// /// Impl Reference: . -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[derive(Debug, Clone, PartialEq, Hash, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct CharType { - element_type: DataType, + nullable: bool, length: usize, } impl Display for CharType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "CHAR({})", self.length)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -409,7 +309,7 @@ impl Display for CharType { impl Default for CharType { fn default() -> Self { - Self::with_length(Self::DEFAULT_LENGTH) + Self::new(Self::DEFAULT_LENGTH).unwrap() } } @@ -420,44 +320,40 @@ impl CharType { pub const MAX_LENGTH: usize = 255; - pub fn new(is_nullable: bool, length: usize) -> Self { - Self::try_new(is_nullable, length).unwrap() + pub fn new(length: usize) -> Result { + Self::with_nullable(true, length) } - pub fn try_new(is_nullable: bool, length: usize) -> Result { + pub fn with_nullable(nullable: bool, length: usize) -> Result { if !(Self::MIN_LENGTH..=Self::MAX_LENGTH).contains(&length) { return Err("Character string length must be between 1 and 255 (both inclusive)."); } - Ok(CharType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::Char, - }, - length, - }) - } - - pub fn with_length(length: usize) -> Self { - Self::new(true, length) + Ok(CharType { nullable, length }) } pub fn length(&self) -> usize { self.length } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::CHARACTER_STRING + } } /// DateType for paimon. /// -/// Impl Reference: . -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +/// Data type of a date consisting of `year-month-day` with values ranging from `0000-01-01` to `9999-12-31` +/// +/// Impl Reference: . +#[derive(Debug, Clone, PartialEq, Hash, Eq, Deserialize, Serialize)] pub struct DateType { - element_type: DataType, + nullable: bool, } impl Display for DateType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "DATE")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -466,24 +362,33 @@ impl Display for DateType { impl Default for DateType { fn default() -> Self { - Self::new(true) + Self::new() } } impl DateType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Date), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME } } /// DecimalType for paimon. /// -/// Impl Reference: . +/// Data type of a decimal number with fixed precision and scale. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct DecimalType { - element_type: DataType, + nullable: bool, + precision: u32, scale: u32, } @@ -491,7 +396,7 @@ pub struct DecimalType { impl Display for DecimalType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "DECIMAL({}, {})", self.precision, self.scale)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -500,7 +405,7 @@ impl Display for DecimalType { impl Default for DecimalType { fn default() -> Self { - Self::with_precision_and_scale(Self::DEFAULT_PRECISION, Self::DEFAULT_SCALE) + Self::new(Self::DEFAULT_PRECISION, Self::DEFAULT_SCALE).unwrap() } } @@ -515,11 +420,11 @@ impl DecimalType { pub const DEFAULT_SCALE: u32 = 0; - pub fn new(is_nullable: bool, precision: u32, scale: u32) -> Self { - Self::try_new(is_nullable, precision, scale).unwrap() + pub fn new(precision: u32, scale: u32) -> Result { + Self::with_nullable(true, precision, scale) } - pub fn try_new(is_nullable: bool, precision: u32, scale: u32) -> Result { + pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Result { if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { return Err(format!( "Decimal precision must be between {} and {} (both inclusive).", @@ -537,19 +442,12 @@ impl DecimalType { } Ok(DecimalType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::Decimal, - }, + nullable, precision, scale, }) } - pub fn with_precision_and_scale(precision: u32, scale: u32) -> Self { - Self::new(true, precision, scale) - } - pub fn precision(&self) -> u32 { self.precision } @@ -557,20 +455,26 @@ impl DecimalType { pub fn scale(&self) -> u32 { self.scale } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::NUMERIC | DataTypeFamily::EXACT_NUMERIC + } } /// DoubleType for paimon. /// -/// Impl Reference: . +/// Data type of an 8-byte double precision floating point number. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct DoubleType { - element_type: DataType, + nullable: bool, } impl Display for DoubleType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "DOUBLE")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -579,30 +483,36 @@ impl Display for DoubleType { impl Default for DoubleType { fn default() -> Self { - Self::new(true) + Self::new() } } impl DoubleType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Double), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::NUMERIC | DataTypeFamily::APPROXIMATE_NUMERIC } } /// FloatType for paimon. /// -/// Impl Reference: . +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct FloatType { - element_type: DataType, + nullable: bool, } impl Display for FloatType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "FLOAT")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -611,30 +521,38 @@ impl Display for FloatType { impl Default for FloatType { fn default() -> Self { - Self::new(true) + Self::new() } } impl FloatType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Float), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::NUMERIC | DataTypeFamily::APPROXIMATE_NUMERIC } } /// IntType for paimon. /// -/// Impl Reference: . +/// Data type of a 4-byte (2^32) signed integer with values from -2,147,483,648 to 2,147,483,647. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct IntType { - element_type: DataType, + nullable: bool, } impl Display for IntType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "INTEGER")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -643,31 +561,42 @@ impl Display for IntType { impl Default for IntType { fn default() -> Self { - Self::new(true) + Self::new() } } impl IntType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Integer), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED + | DataTypeFamily::NUMERIC + | DataTypeFamily::INTEGER_NUMERIC + | DataTypeFamily::EXACT_NUMERIC } } /// LocalZonedTimestampType for paimon. /// -/// Impl Reference: . +/// Data type of a timestamp WITH LOCAL time zone consisting of `year-month-day hour:minute:second[.fractional] zone` with up to nanosecond precision and values ranging from `0000-01-01 00:00:00.000000000 +14:59` to `9999-12-31 23:59:59.999999999 -14:59`. Leap seconds (23:59:60 and 23:59:61) are not supported as the semantics are closer to a point in time than a wall-clock time. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct LocalZonedTimestampType { - element_type: DataType, + nullable: bool, precision: u32, } impl Display for LocalZonedTimestampType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "TIMESTAMP WITH LOCAL TIME ZONE({})", self.precision)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -676,7 +605,7 @@ impl Display for LocalZonedTimestampType { impl Default for LocalZonedTimestampType { fn default() -> Self { - Self::with_precision(Self::DEFAULT_PRECISION) + Self::new(Self::DEFAULT_PRECISION).unwrap() } } @@ -687,11 +616,11 @@ impl LocalZonedTimestampType { pub const DEFAULT_PRECISION: u32 = TimestampType::DEFAULT_PRECISION; - pub fn new(is_nullable: bool, precision: u32) -> Self { - LocalZonedTimestampType::try_new(is_nullable, precision).unwrap() + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) } - pub fn try_new(is_nullable: bool, precision: u32) -> Result { + pub fn with_nullable(nullable: bool, precision: u32) -> Result { if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { return Err(format!( "Timestamp precision must be between {} and {} (both inclusive).", @@ -701,37 +630,37 @@ impl LocalZonedTimestampType { } Ok(LocalZonedTimestampType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::TimestampWithLocalTimeZone, - }, + nullable, precision, }) } - pub fn with_precision(precision: u32) -> Self { - Self::new(true, precision) - } - pub fn precision(&self) -> u32 { self.precision } -} -/// Next TODO: MapType、MultisetType、RowType + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED + | DataTypeFamily::DATETIME + | DataTypeFamily::TIMESTAMP + | DataTypeFamily::EXTENSION + } +} /// SmallIntType for paimon. /// -/// Impl Reference: . +/// Data type of a 2-byte (2^16) signed integer with values from -32,768 to 32,767. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct SmallIntType { - element_type: DataType, + nullable: bool, } impl Display for SmallIntType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "SMALLINT")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -740,31 +669,43 @@ impl Display for SmallIntType { impl Default for SmallIntType { fn default() -> Self { - Self::new(true) + Self::new() } } impl SmallIntType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Smallint), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED + | DataTypeFamily::NUMERIC + | DataTypeFamily::INTEGER_NUMERIC + | DataTypeFamily::EXACT_NUMERIC } } /// TimeType for paimon. /// -/// Impl Reference: . +/// Data type of a time WITHOUT time zone consisting of `hour:minute:second[.fractional]` with +/// up to nanosecond precision and values ranging from `00:00:00.000000000` to `23:59:59.999999999`. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct TimeType { - element_type: DataType, + nullable: bool, precision: u32, } impl Display for TimeType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "TIME({})", self.precision)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -773,7 +714,7 @@ impl Display for TimeType { impl Default for TimeType { fn default() -> Self { - Self::with_precision(TimeType::DEFAULT_PRECISION) + Self::new(TimeType::DEFAULT_PRECISION).unwrap() } } @@ -784,11 +725,11 @@ impl TimeType { pub const DEFAULT_PRECISION: u32 = 0; - pub fn new(is_nullable: bool, precision: u32) -> Self { - Self::try_new(is_nullable, precision).unwrap() + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) } - pub fn try_new(is_nullable: bool, precision: u32) -> Result { + pub fn with_nullable(nullable: bool, precision: u32) -> Result { if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { return Err(format!( "Time precision must be between {} and {} (both inclusive).", @@ -798,36 +739,35 @@ impl TimeType { } Ok(TimeType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::TimeWithoutTimeZone, - }, + nullable, precision, }) } - pub fn with_precision(precision: u32) -> Self { - Self::new(true, precision) - } - pub fn precision(&self) -> u32 { self.precision } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME | DataTypeFamily::TIME + } } /// TimestampType for paimon. /// -/// Impl Reference: . +/// Data type of a timestamp WITHOUT time zone consisting of `year-month-day hour:minute:second[.fractional]` with up to nanosecond precision and values ranging from `0000-01-01 00:00:00.000000000` to `9999-12-31 23:59:59.999999999`. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct TimestampType { - element_type: DataType, + nullable: bool, precision: u32, } impl Display for TimestampType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "TIMESTAMP({})", self.precision)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -836,7 +776,7 @@ impl Display for TimestampType { impl Default for TimestampType { fn default() -> Self { - Self::with_precision(Self::DEFAULT_PRECISION) + Self::new(Self::DEFAULT_PRECISION).unwrap() } } @@ -847,11 +787,11 @@ impl TimestampType { pub const DEFAULT_PRECISION: u32 = 6; - pub fn new(is_nullable: bool, precision: u32) -> Self { - Self::try_new(is_nullable, precision).unwrap() + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) } - pub fn try_new(is_nullable: bool, precision: u32) -> Result { + pub fn with_nullable(nullable: bool, precision: u32) -> Result { if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { return Err(format!( "Timestamp precision must be between {} and {} (both inclusive).", @@ -861,35 +801,34 @@ impl TimestampType { } Ok(TimestampType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::TimestampWithoutTimeZone, - }, + nullable, precision, }) } - pub fn with_precision(precision: u32) -> Self { - Self::new(true, precision) - } - pub fn precision(&self) -> u32 { self.precision } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::DATETIME | DataTypeFamily::TIMESTAMP + } } /// TinyIntType for paimon. /// -/// Impl Reference: . +/// Data type of a 1-byte signed integer with values from -128 to 127. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct TinyIntType { - element_type: DataType, + nullable: bool, } impl Display for TinyIntType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "TINYINT")?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -898,31 +837,42 @@ impl Display for TinyIntType { impl Default for TinyIntType { fn default() -> Self { - Self::new(true) + Self::new() } } impl TinyIntType { - pub fn new(is_nullable: bool) -> Self { - Self { - element_type: DataType::new(is_nullable, DataTypeRoot::Tinyint), - } + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED + | DataTypeFamily::NUMERIC + | DataTypeFamily::INTEGER_NUMERIC + | DataTypeFamily::EXACT_NUMERIC } } /// VarBinaryType for paimon. /// -/// Impl Reference: . +/// Data type of a variable-length binary string (=a sequence of bytes). +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct VarBinaryType { - element_type: DataType, + nullable: bool, length: u32, } impl Display for VarBinaryType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "VARBINARY({})", self.length)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -931,7 +881,7 @@ impl Display for VarBinaryType { impl Default for VarBinaryType { fn default() -> Self { - Self::with_length(Self::DEFAULT_LENGTH) + Self::new(Self::DEFAULT_LENGTH).unwrap() } } @@ -942,46 +892,42 @@ impl VarBinaryType { pub const DEFAULT_LENGTH: u32 = 1; - pub fn new(is_nullable: bool, length: u32) -> Self { - Self::try_new(is_nullable, length).unwrap() + pub fn new(length: u32) -> Result { + Self::try_new(true, length) } - pub fn try_new(is_nullable: bool, length: u32) -> Result { + pub fn try_new(nullable: bool, length: u32) -> Result { if length < Self::MIN_LENGTH { return Err("Binary string length must be at least 1.".to_string()); } - Ok(VarBinaryType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::Varbinary, - }, - length, - }) - } - - pub fn with_length(length: u32) -> Self { - Self::new(true, length) + Ok(VarBinaryType { nullable, length }) } pub fn length(&self) -> u32 { self.length } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::BINARY_STRING + } } /// VarCharType for paimon. /// -/// Impl Reference: . +/// Data type of a variable-length character string. +/// +/// Impl Reference: . #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct VarCharType { - element_type: DataType, + nullable: bool, length: u32, } impl Display for VarCharType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "VARCHAR({})", self.length)?; - if !self.element_type.is_nullable() { + if !self.nullable { write!(f, " NOT NULL")?; } Ok(()) @@ -990,7 +936,7 @@ impl Display for VarCharType { impl Default for VarCharType { fn default() -> Self { - Self::with_length(Self::DEFAULT_LENGTH) + Self::new(Self::DEFAULT_LENGTH).unwrap() } } @@ -1001,11 +947,11 @@ impl VarCharType { pub const DEFAULT_LENGTH: u32 = 1; - pub fn new(is_nullable: bool, length: u32) -> Self { - Self::try_new(is_nullable, length).unwrap() + pub fn new(length: u32) -> Result { + Self::with_nullable(true, length) } - pub fn try_new(is_nullable: bool, length: u32) -> Result { + pub fn with_nullable(nullable: bool, length: u32) -> Result { if !(Self::MIN_LENGTH..=Self::MAX_LENGTH).contains(&length) { return Err(format!( "Character string length must be between {} and {} (both inclusive).", @@ -1014,20 +960,119 @@ impl VarCharType { )); } - Ok(VarCharType { - element_type: DataType { - is_nullable, - type_root: DataTypeRoot::Varchar, - }, - length, - }) - } - - pub fn with_length(length: u32) -> Self { - Self::new(true, length) + Ok(VarCharType { nullable, length }) } pub fn length(&self) -> u32 { self.length } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::PREDEFINED | DataTypeFamily::CHARACTER_STRING + } +} + +/// MapType for paimon. +/// +/// Data type of an associative array that maps keys `NULL` to values (including `NULL`). +/// +/// Impl Reference: . +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct MapType { + nullable: bool, + key_type: Box, + value_type: Box, +} + +impl Display for MapType { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +impl MapType { + pub fn new(key_type: DataType, value_type: DataType) -> Self { + Self::with_nullable(true, key_type, value_type) + } + + pub fn with_nullable(nullable: bool, key_type: DataType, value_type: DataType) -> Self { + Self { + nullable, + key_type: Box::new(key_type), + value_type: Box::new(value_type), + } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::CONSTRUCTED | DataTypeFamily::COLLECTION + } +} + +/// MultisetType for paimon. +/// +/// Data type of a multiset (=bag). Unlike a set, it allows for multiple instances for each of its +/// elements with a common subtype. +/// +/// Impl Reference: . +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct MultisetType { + nullable: bool, + element_type: Box, +} + +impl Display for MultisetType { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +impl MultisetType { + pub fn new(element_type: DataType) -> Self { + Self::with_nullable(true, element_type) + } + + pub fn with_nullable(nullable: bool, element_type: DataType) -> Self { + Self { + nullable, + element_type: Box::new(element_type), + } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::CONSTRUCTED | DataTypeFamily::COLLECTION + } +} + +/// RowType for paimon. +/// +/// Data type of a sequence of fields. A field consists of a field name, field type, and an optional +/// description. The most specific type of a row of a table is a row type. In this case, each column +/// of the row corresponds to the field of the row type that has the same ordinal position as the +/// column. +/// +/// Impl Reference: . +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct RowType { + nullable: bool, + fields: Vec, +} + +impl Display for RowType { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +impl RowType { + pub const fn new(fields: Vec) -> Self { + Self::with_nullable(true, fields) + } + + pub const fn with_nullable(nullable: bool, fields: Vec) -> Self { + Self { nullable, fields } + } + + pub fn family(&self) -> DataTypeFamily { + DataTypeFamily::CONSTRUCTED + } }