From 850ba66781a122a4bfd2ccdfc44da6fed5a082b5 Mon Sep 17 00:00:00 2001 From: fqaiser94 Date: Thu, 12 Dec 2024 22:08:37 -0500 Subject: [PATCH] Move inside iceberg crate --- Cargo.toml | 1 - crates/iceberg/Cargo.toml | 1 + crates/iceberg/src/lib.rs | 2 + .../src => iceberg/src/puffin}/blob.rs | 0 .../src => iceberg/src/puffin}/compression.rs | 5 ++- .../{puffin/src => iceberg/src/puffin}/lib.rs | 0 .../src => iceberg/src/puffin}/metadata.rs | 12 ++--- crates/iceberg/src/puffin/mod.rs | 38 ++++++++++++++++ .../src => iceberg/src/puffin}/reader.rs | 13 +++--- .../src => iceberg/src/puffin}/test_utils.rs | 25 +++++------ .../src => iceberg/src/puffin}/writer.rs | 28 ++++++------ crates/iceberg/src/writer/file_writer/mod.rs | 3 +- .../src/writer/file_writer/track_writer.rs | 2 +- .../empty-puffin-uncompressed.bin | Bin .../sample-metric-data-compressed-zstd.bin | Bin .../sample-metric-data-uncompressed.bin | Bin .../empty-puffin-uncompressed.bin | Bin .../sample-metric-data-compressed-zstd.bin | Bin .../sample-metric-data-uncompressed.bin | Bin crates/puffin/Cargo.toml | 41 ------------------ 20 files changed, 84 insertions(+), 87 deletions(-) rename crates/{puffin/src => iceberg/src/puffin}/blob.rs (100%) rename crates/{puffin/src => iceberg/src/puffin}/compression.rs (97%) rename crates/{puffin/src => iceberg/src/puffin}/lib.rs (100%) rename crates/{puffin/src => iceberg/src/puffin}/metadata.rs (98%) create mode 100644 crates/iceberg/src/puffin/mod.rs rename crates/{puffin/src => iceberg/src/puffin}/reader.rs (94%) rename crates/{puffin/src => iceberg/src/puffin}/test_utils.rs (88%) rename crates/{puffin/src => iceberg/src/puffin}/writer.rs (95%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/java-generated/empty-puffin-uncompressed.bin (100%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/java-generated/sample-metric-data-compressed-zstd.bin (100%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/java-generated/sample-metric-data-uncompressed.bin (100%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/rust-generated/empty-puffin-uncompressed.bin (100%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/rust-generated/sample-metric-data-compressed-zstd.bin (100%) rename crates/{puffin/testdata/v1 => iceberg/testdata/puffin}/rust-generated/sample-metric-data-uncompressed.bin (100%) delete mode 100644 crates/puffin/Cargo.toml diff --git a/Cargo.toml b/Cargo.toml index 296cb5d42..9a76ff416 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,6 @@ members = [ "crates/iceberg", "crates/integration_tests", "crates/integrations/*", - "crates/puffin", "crates/test_utils", ] exclude = ["bindings/python"] diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 1307cc6f3..f2e6694bc 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -81,6 +81,7 @@ tokio = { workspace = true, optional = true } typed-builder = { workspace = true } url = { workspace = true } uuid = { workspace = true } +zstd = { workspace = true } [dev-dependencies] ctor = { workspace = true } diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs index 72cf18d4b..d2ec20348 100644 --- a/crates/iceberg/src/lib.rs +++ b/crates/iceberg/src/lib.rs @@ -84,3 +84,5 @@ mod runtime; pub mod arrow; mod utils; pub mod writer; + +pub mod puffin; diff --git a/crates/puffin/src/blob.rs b/crates/iceberg/src/puffin/blob.rs similarity index 100% rename from crates/puffin/src/blob.rs rename to crates/iceberg/src/puffin/blob.rs diff --git a/crates/puffin/src/compression.rs b/crates/iceberg/src/puffin/compression.rs similarity index 97% rename from crates/puffin/src/compression.rs rename to crates/iceberg/src/puffin/compression.rs index 76629c0ea..a6321cfc3 100644 --- a/crates/puffin/src/compression.rs +++ b/crates/iceberg/src/puffin/compression.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use iceberg::{Error, ErrorKind, Result}; use serde::{Deserialize, Serialize}; +use crate::{Error, ErrorKind, Result}; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Copy)] #[serde(rename_all = "lowercase")] #[derive(Default)] @@ -73,7 +74,7 @@ impl CompressionCodec { #[cfg(test)] mod tests { - use crate::compression::CompressionCodec; + use crate::puffin::compression::CompressionCodec; #[tokio::test] async fn test_compression_codec_none() { diff --git a/crates/puffin/src/lib.rs b/crates/iceberg/src/puffin/lib.rs similarity index 100% rename from crates/puffin/src/lib.rs rename to crates/iceberg/src/puffin/lib.rs diff --git a/crates/puffin/src/metadata.rs b/crates/iceberg/src/puffin/metadata.rs similarity index 98% rename from crates/puffin/src/metadata.rs rename to crates/iceberg/src/puffin/metadata.rs index 2272d4666..f4025d526 100644 --- a/crates/puffin/src/metadata.rs +++ b/crates/iceberg/src/puffin/metadata.rs @@ -18,12 +18,12 @@ use std::collections::{HashMap, HashSet}; use bytes::Bytes; -use iceberg::io::{FileRead, InputFile}; -use iceberg::{Error, ErrorKind, Result}; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; -use crate::compression::CompressionCodec; +use crate::io::{FileRead, InputFile}; +use crate::puffin::compression::CompressionCodec; +use crate::{Error, ErrorKind, Result}; /// Human-readable identification of the application writing the file, along with its version. /// Example: "Trino version 381" @@ -286,11 +286,11 @@ mod tests { use std::collections::HashMap; use bytes::Bytes; - use iceberg::io::{FileIOBuilder, InputFile}; use tempfile::TempDir; - use crate::metadata::{BlobMetadata, CompressionCodec, FileMetadata}; - use crate::test_utils::{ + use crate::io::{FileIOBuilder, InputFile}; + use crate::puffin::metadata::{BlobMetadata, CompressionCodec, FileMetadata}; + use crate::puffin::test_utils::{ empty_footer_payload, empty_footer_payload_bytes, empty_footer_payload_bytes_length_bytes, rust_empty_uncompressed_input_file, rust_uncompressed_metric_input_file, rust_zstd_compressed_metric_input_file, uncompressed_metric_file_metadata, diff --git a/crates/iceberg/src/puffin/mod.rs b/crates/iceberg/src/puffin/mod.rs new file mode 100644 index 000000000..33d6853dc --- /dev/null +++ b/crates/iceberg/src/puffin/mod.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Iceberg Puffin file format implementation + +#![deny(missing_docs)] + +mod blob; +pub use blob::{Blob, APACHE_DATASKETCHES_THETA_V1}; + +mod compression; +pub use compression::CompressionCodec; + +mod metadata; +pub use metadata::{BlobMetadata, FileMetadata, CREATED_BY_PROPERTY}; + +mod reader; +pub use reader::PuffinReader; + +#[cfg(test)] +mod test_utils; + +mod writer; +pub use writer::PuffinWriter; diff --git a/crates/puffin/src/reader.rs b/crates/iceberg/src/puffin/reader.rs similarity index 94% rename from crates/puffin/src/reader.rs rename to crates/iceberg/src/puffin/reader.rs index 56160569e..bcef11045 100644 --- a/crates/puffin/src/reader.rs +++ b/crates/iceberg/src/puffin/reader.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -use iceberg::io::{FileRead, InputFile}; -use iceberg::Result; - -use crate::blob::Blob; -use crate::metadata::{BlobMetadata, FileMetadata}; +use crate::io::{FileRead, InputFile}; +use crate::puffin::blob::Blob; +use crate::puffin::metadata::{BlobMetadata, FileMetadata}; +use crate::Result; /// Puffin reader pub struct PuffinReader { @@ -68,12 +67,12 @@ impl PuffinReader { #[cfg(test)] mod tests { - use crate::test_utils::{ + use crate::puffin::test_utils::{ blob_0, blob_1, rust_uncompressed_metric_input_file, rust_zstd_compressed_metric_input_file, uncompressed_metric_file_metadata, zstd_compressed_metric_file_metadata, }; - use crate::PuffinReader; + use crate::puffin::PuffinReader; #[tokio::test] async fn test_puffin_reader_uncompressed_metric_data() { diff --git a/crates/puffin/src/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs similarity index 88% rename from crates/puffin/src/test_utils.rs rename to crates/iceberg/src/puffin/test_utils.rs index 0f2e934a3..4ae1911df 100644 --- a/crates/puffin/src/test_utils.rs +++ b/crates/iceberg/src/puffin/test_utils.rs @@ -17,14 +17,13 @@ use std::collections::HashMap; -use iceberg::io::{FileIOBuilder, InputFile}; +use crate::io::{FileIOBuilder, InputFile}; +use crate::puffin::blob::Blob; +use crate::puffin::compression::CompressionCodec; +use crate::puffin::metadata::{BlobMetadata, FileMetadata, CREATED_BY_PROPERTY}; -use crate::blob::Blob; -use crate::compression::CompressionCodec; -use crate::metadata::{BlobMetadata, FileMetadata, CREATED_BY_PROPERTY}; - -const V1_RUST: &str = "testdata/v1/rust-generated"; -const V1_JAVA: &str = "testdata/v1/java-generated"; +const RUST_TESTDATA: &str = "testdata/puffin/rust-generated"; +const JAVA_TESTDATA: &str = "testdata/puffin/java-generated"; const EMPTY_UNCOMPRESSED: &str = "empty-puffin-uncompressed.bin"; const METRIC_UNCOMPRESSED: &str = "sample-metric-data-uncompressed.bin"; const METRIC_ZSTD_COMPRESSED: &str = "sample-metric-data-compressed-zstd.bin"; @@ -38,27 +37,27 @@ fn input_file_for_test_data(path: &str) -> InputFile { } pub(crate) fn java_empty_uncompressed_input_file() -> InputFile { - input_file_for_test_data(&[V1_JAVA, EMPTY_UNCOMPRESSED].join("/")) + input_file_for_test_data(&[JAVA_TESTDATA, EMPTY_UNCOMPRESSED].join("/")) } pub(crate) fn rust_empty_uncompressed_input_file() -> InputFile { - input_file_for_test_data(&[V1_RUST, EMPTY_UNCOMPRESSED].join("/")) + input_file_for_test_data(&[RUST_TESTDATA, EMPTY_UNCOMPRESSED].join("/")) } pub(crate) fn java_uncompressed_metric_input_file() -> InputFile { - input_file_for_test_data(&[V1_JAVA, METRIC_UNCOMPRESSED].join("/")) + input_file_for_test_data(&[JAVA_TESTDATA, METRIC_UNCOMPRESSED].join("/")) } pub(crate) fn rust_uncompressed_metric_input_file() -> InputFile { - input_file_for_test_data(&[V1_RUST, METRIC_UNCOMPRESSED].join("/")) + input_file_for_test_data(&[RUST_TESTDATA, METRIC_UNCOMPRESSED].join("/")) } pub(crate) fn java_zstd_compressed_metric_input_file() -> InputFile { - input_file_for_test_data(&[V1_JAVA, METRIC_ZSTD_COMPRESSED].join("/")) + input_file_for_test_data(&[JAVA_TESTDATA, METRIC_ZSTD_COMPRESSED].join("/")) } pub(crate) fn rust_zstd_compressed_metric_input_file() -> InputFile { - input_file_for_test_data(&[V1_RUST, METRIC_ZSTD_COMPRESSED].join("/")) + input_file_for_test_data(&[RUST_TESTDATA, METRIC_ZSTD_COMPRESSED].join("/")) } pub(crate) fn empty_footer_payload() -> FileMetadata { diff --git a/crates/puffin/src/writer.rs b/crates/iceberg/src/puffin/writer.rs similarity index 95% rename from crates/puffin/src/writer.rs rename to crates/iceberg/src/puffin/writer.rs index 68299d2fd..b0afb86d0 100644 --- a/crates/puffin/src/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -20,13 +20,13 @@ use std::sync::atomic::AtomicU64; use std::sync::Arc; use bytes::Bytes; -use iceberg::io::{FileWrite, OutputFile}; -use iceberg::writer::file_writer::TrackWriter; -use iceberg::{Error, ErrorKind, Result}; -use crate::blob::Blob; -use crate::compression::CompressionCodec; -use crate::metadata::{BlobMetadata, ByteNumber, FileMetadata, Flag}; +use crate::io::{FileWrite, OutputFile}; +use crate::puffin::blob::Blob; +use crate::puffin::compression::CompressionCodec; +use crate::puffin::metadata::{BlobMetadata, ByteNumber, FileMetadata, Flag}; +use crate::writer::file_writer::track_writer::TrackWriter; +use crate::{Error, ErrorKind, Result}; /// Puffin writer pub struct PuffinWriter { @@ -186,21 +186,21 @@ impl PuffinWriter { mod tests { use std::collections::HashMap; - use iceberg::io::{FileIOBuilder, InputFile, OutputFile}; - use iceberg::Result; use tempfile::TempDir; - use crate::blob::Blob; - use crate::compression::CompressionCodec; - use crate::metadata::FileMetadata; - use crate::test_utils::{ + use crate::io::{FileIOBuilder, InputFile, OutputFile}; + use crate::puffin::blob::Blob; + use crate::puffin::compression::CompressionCodec; + use crate::puffin::metadata::FileMetadata; + use crate::puffin::test_utils::{ blob_0, blob_1, empty_footer_payload, empty_footer_payload_bytes, file_properties, java_empty_uncompressed_input_file, java_uncompressed_metric_input_file, java_zstd_compressed_metric_input_file, uncompressed_metric_file_metadata, zstd_compressed_metric_file_metadata, }; - use crate::writer::PuffinWriter; - use crate::PuffinReader; + use crate::puffin::writer::PuffinWriter; + use crate::puffin::PuffinReader; + use crate::Result; #[tokio::test] async fn test_throws_error_if_attempt_to_add_blob_after_closing() { diff --git a/crates/iceberg/src/writer/file_writer/mod.rs b/crates/iceberg/src/writer/file_writer/mod.rs index 399c2e46f..854ef1d39 100644 --- a/crates/iceberg/src/writer/file_writer/mod.rs +++ b/crates/iceberg/src/writer/file_writer/mod.rs @@ -27,8 +27,7 @@ use crate::Result; mod parquet_writer; pub use parquet_writer::{ParquetWriter, ParquetWriterBuilder}; -mod track_writer; -pub use track_writer::TrackWriter; +pub(crate) mod track_writer; pub mod location_generator; diff --git a/crates/iceberg/src/writer/file_writer/track_writer.rs b/crates/iceberg/src/writer/file_writer/track_writer.rs index ab53da0fb..71bce7ad1 100644 --- a/crates/iceberg/src/writer/file_writer/track_writer.rs +++ b/crates/iceberg/src/writer/file_writer/track_writer.rs @@ -24,7 +24,7 @@ use crate::io::FileWrite; use crate::Result; /// `TrackWriter` is used to track the written size. -pub struct TrackWriter { +pub(crate) struct TrackWriter { inner: Box, written_size: Arc, } diff --git a/crates/puffin/testdata/v1/java-generated/empty-puffin-uncompressed.bin b/crates/iceberg/testdata/puffin/java-generated/empty-puffin-uncompressed.bin similarity index 100% rename from crates/puffin/testdata/v1/java-generated/empty-puffin-uncompressed.bin rename to crates/iceberg/testdata/puffin/java-generated/empty-puffin-uncompressed.bin diff --git a/crates/puffin/testdata/v1/java-generated/sample-metric-data-compressed-zstd.bin b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-compressed-zstd.bin similarity index 100% rename from crates/puffin/testdata/v1/java-generated/sample-metric-data-compressed-zstd.bin rename to crates/iceberg/testdata/puffin/java-generated/sample-metric-data-compressed-zstd.bin diff --git a/crates/puffin/testdata/v1/java-generated/sample-metric-data-uncompressed.bin b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-uncompressed.bin similarity index 100% rename from crates/puffin/testdata/v1/java-generated/sample-metric-data-uncompressed.bin rename to crates/iceberg/testdata/puffin/java-generated/sample-metric-data-uncompressed.bin diff --git a/crates/puffin/testdata/v1/rust-generated/empty-puffin-uncompressed.bin b/crates/iceberg/testdata/puffin/rust-generated/empty-puffin-uncompressed.bin similarity index 100% rename from crates/puffin/testdata/v1/rust-generated/empty-puffin-uncompressed.bin rename to crates/iceberg/testdata/puffin/rust-generated/empty-puffin-uncompressed.bin diff --git a/crates/puffin/testdata/v1/rust-generated/sample-metric-data-compressed-zstd.bin b/crates/iceberg/testdata/puffin/rust-generated/sample-metric-data-compressed-zstd.bin similarity index 100% rename from crates/puffin/testdata/v1/rust-generated/sample-metric-data-compressed-zstd.bin rename to crates/iceberg/testdata/puffin/rust-generated/sample-metric-data-compressed-zstd.bin diff --git a/crates/puffin/testdata/v1/rust-generated/sample-metric-data-uncompressed.bin b/crates/iceberg/testdata/puffin/rust-generated/sample-metric-data-uncompressed.bin similarity index 100% rename from crates/puffin/testdata/v1/rust-generated/sample-metric-data-uncompressed.bin rename to crates/iceberg/testdata/puffin/rust-generated/sample-metric-data-uncompressed.bin diff --git a/crates/puffin/Cargo.toml b/crates/puffin/Cargo.toml deleted file mode 100644 index d58e1c45e..000000000 --- a/crates/puffin/Cargo.toml +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "iceberg-puffin" -version = { workspace = true } -edition = { workspace = true } -homepage = { workspace = true } -rust-version = { workspace = true } - -categories = ["database"] -description = "Apache Iceberg Puffin" -repository = { workspace = true } -license = { workspace = true } -keywords = ["iceberg", "puffin"] - -[dependencies] -bytes = { workspace = true } -iceberg = { workspace = true } -once_cell = { workspace = true } -serde = { workspace = true } -serde_json = { workspace = true } -zstd = { workspace = true } - -[dev-dependencies] -tempfile = { workspace = true } -tokio = { workspace = true }