diff --git a/.github/actions/setup-builder/action.yml b/.github/actions/setup-builder/action.yml index 43de1cbaa..476feb9ca 100644 --- a/.github/actions/setup-builder/action.yml +++ b/.github/actions/setup-builder/action.yml @@ -21,18 +21,24 @@ name: Prepare Rust Builder description: 'Prepare Rust Build Environment' inputs: rust-version: - description: 'version of rust to install (e.g. stable)' - required: true - default: 'stable' + description: 'version of rust to install and use' runs: using: "composite" steps: - - name: Setup Rust toolchain + - name: Setup specified Rust toolchain shell: bash + if: ${{ inputs.rust-version != '' }} run: | echo "Installing ${{ inputs.rust-version }}" rustup toolchain install ${{ inputs.rust-version }} - rustup default ${{ inputs.rust-version }} + rustup override set ${{ inputs.rust-version }} + rustup component add rustfmt clippy + - name: Setup Rust toolchain according to rust-toolchain.toml + shell: bash + if: ${{ inputs.rust-version == '' }} + run: | + echo "Installing toolchain according to rust-toolchain.toml" + rustup show rustup component add rustfmt clippy - name: Fixup git permissions # https://github.com/actions/checkout/issues/766 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c56a25cf8..80ce5f70c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,14 +43,18 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Check License Header uses: apache/skywalking-eyes/header@v0.6.0 - - name: Install cargo-sort - run: make install-cargo-sort - - name: Install taplo-cli - run: make install-taplo-cli + - name: Install taplo-cli + uses: taiki-e/install-action@v2 + with: + tool: taplo-cli@0.9.3 + - name: Check toml format + run: make check-toml - name: Cargo format run: make check-fmt @@ -61,11 +65,19 @@ jobs: - name: Cargo clippy run: make check-clippy + - name: Install cargo-sort + uses: taiki-e/install-action@v2 + with: + tool: cargo-sort@1.0.9 - name: Cargo sort - run: make cargo-sort + run: cargo sort -c -w + - name: Install cargo-machete + uses: taiki-e/install-action@v2 + with: + tool: cargo-machete - name: Cargo Machete - run: make cargo-machete + run: cargo machete build: runs-on: ${{ matrix.os }} @@ -80,12 +92,10 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: ${{ env.rust_msrv }} - name: Cache Rust artifacts uses: Swatinem/rust-cache@v2 - + - name: Build run: make build @@ -100,6 +110,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Cache Rust artifacts uses: Swatinem/rust-cache@v2 @@ -113,8 +126,6 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: ${{ env.rust_msrv }} - name: Cache Rust artifacts uses: Swatinem/rust-cache@v2 @@ -127,3 +138,24 @@ jobs: - name: Doc Test run: cargo test --no-fail-fast --doc --all-features --workspace + + msrv: + name: Verify MSRV + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Nightly Rust toolchain + uses: ./.github/actions/setup-builder + - name: Generate minimal versions lockfile + run: | + cargo generate-lockfile -Z direct-minimal-versions -Z minimal-versions + # Some dependencies don't correctly specify a minimal version for their own dependencies and will fail to build. + # So we update these transitive dependencies here.
+ cargo update tap faststr metainfo linkedbytes + - name: Setup MSRV Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{ env.rust_msrv }} + - name: Check MSRV + run: | + cargo +${{ env.rust_msrv }} check --locked --workspace --exclude iceberg-datafusion --exclude iceberg-catalog-s3tables --exclude iceberg-integration-tests diff --git a/.github/workflows/ci_typos.yml b/.github/workflows/ci_typos.yml index b0f0349eb..593f015fa 100644 --- a/.github/workflows/ci_typos.yml +++ b/.github/workflows/ci_typos.yml @@ -42,4 +42,4 @@ jobs: steps: - uses: actions/checkout@v4 - name: Check typos - uses: crate-ci/typos@v1.28.4 + uses: crate-ci/typos@v1.29.4 diff --git a/Cargo.lock b/Cargo.lock index d0b27f717..941907e44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,11 +146,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", + "once_cell", "windows-sys 0.59.0", ] @@ -384,9 +385,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" [[package]] name = "arrow-select" @@ -416,7 +417,7 @@ dependencies = [ "memchr", "num", "regex", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -445,7 +446,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "435a87a52755b8f27fcf321ac4f04b2802e337c8c4872923137471ec39c37532" dependencies = [ - "event-listener 5.3.1", + "event-listener 5.4.0", "event-listener-strategy", "futures-core", "pin-project-lite", @@ -544,7 +545,7 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" dependencies = [ - "event-listener 5.3.1", + "event-listener 5.4.0", "event-listener-strategy", "pin-project-lite", ] @@ -557,7 +558,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -595,13 +596,13 @@ checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -627,9 +628,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.11" +version = "1.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d1c2c88936a73c699225d0bc00684a534166b0cebc2659c3cdf08de8edc64c" +checksum = "c03a50b30228d3af8865ce83376b4e99e1ffa34728220fe2860e4df0bb5278d6" dependencies = [ "aws-credential-types", "aws-runtime", @@ -669,9 +670,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.2" +version = "1.5.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "44f6f1124d6e19ab6daf7f2e615644305dc6cb2d706892a8a8c0b98db35de020" +checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -694,9 +695,9 @@ dependencies = [ [[package]] name = "aws-sdk-glue" -version = "1.74.0" +version = "1.76.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf70e5fdbed7934eff5a4990b0d85d3c02a88c6ae79b4c20b900bcf8c6890aa" +checksum = "9c25c89d6efe63a398cb727b79c285e06184c432985a0d221df0f23d7d10f1f9" dependencies = [ "aws-credential-types", "aws-runtime", @@ -716,9 +717,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3tables" -version = "1.2.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2111e5117b6e6bbe8c89ddca58e5c1339accc74a47757ab1e39db4f26999a426" +checksum = "ceca807f3fd3dbfd5f3d4c374d7729fb3a16d74f6087f2fbdf39dbe076e539a8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -738,9 +739,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.51.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74995133da38f109a0eb8e8c886f9e80c713b6e9f2e6e5a6a1ba4450ce2ffc46" +checksum = "1605dc0bf9f0a4b05b451441a17fcb0bda229db384f23bf5cead3adbab0664ac" dependencies = [ "aws-credential-types", "aws-runtime", @@ -760,9 +761,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.52.0" +version = "1.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7062a779685cbf3b2401eb36151e2c6589fd5f3569b8a6bc2d199e5aaa1d059" +checksum = "59f3f73466ff24f6ad109095e0f3f2c830bfb4cd6c8b12f744c8e61ebf4d3ba1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -782,9 +783,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.52.0" +version = "1.54.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299dae7b1dc0ee50434453fa5a229dc4b22bd3ee50409ff16becf1f7346e0193" +checksum = "861d324ef69247c6f3c6823755f408a68877ffb1a9afaff6dd8b0057c760de60" dependencies = [ "aws-credential-types", "aws-runtime", @@ -805,9 +806,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" +checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -828,9 +829,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.3" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "427cb637d15d63d6f9aae26358e1c9a9c09d5aa490d64b09354c8217cfef0f28" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" dependencies = [ "futures-util", "pin-project-lite", @@ -839,9 +840,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.11" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", @@ -859,9 +860,9 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.1" +version = "0.61.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" dependencies = [ "aws-smithy-types", ] @@ -878,9 +879,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.6" +version = "1.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a05dd41a70fc74051758ee75b5c4db2c0ca070ed9229c3df50e9475cda1cb985" +checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -922,9 +923,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.11" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38ddc9bd6c28aeb303477170ddd183760a956a03e083b3902a990238a7e3792d" +checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" dependencies = [ "base64-simd", "bytes", @@ -957,9 +958,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -1051,9 +1052,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" dependencies = [ "serde", ] @@ -1125,9 +1126,9 @@ dependencies = [ [[package]] name = "borsh" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2506947f73ad44e344215ccd6403ac2ae18cd8e046e581a441bf8d199f257f03" +checksum = "9fb65153674e51d3a42c8f27b05b9508cea85edfaade8aa46bc8fc18cecdfef3" dependencies = [ "borsh-derive", "cfg_aliases", @@ -1135,15 +1136,15 @@ dependencies = [ [[package]] name = "borsh-derive" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2593a3b8b938bd68373196c9832f516be11fa487ef4ae745eb282e6a56a7244" +checksum = "a396e17ad94059c650db3d253bb6e25927f1eb462eede7e7a153bb6e75dce0a7" dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -1169,9 +1170,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.11.1" +version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "786a307d683a5bf92e6fd5fd69a7eb613751668d1d8d67d802846dfe367c62c8" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" dependencies = [ "memchr", "serde", @@ -1272,9 +1273,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.6" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" +checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" dependencies = [ "jobserver", "libc", @@ -1583,7 +1584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501" dependencies = [ "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -1607,7 +1608,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -1618,7 +1619,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -1937,7 +1938,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c" dependencies = [ "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -1956,7 +1957,7 @@ dependencies = [ "log", "recursive", "regex", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -2116,7 +2117,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -2126,7 +2127,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -2161,7 +2162,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -2252,9 +2253,9 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" [[package]] name = "event-listener" -version = "5.3.1" +version = "5.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" +checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae" dependencies = [ "concurrent-queue", "parking", @@ -2267,7 +2268,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" dependencies = [ - "event-listener 5.3.1", + "event-listener 5.4.0", "pin-project-lite", ] @@ -2342,6 +2343,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2418,9 +2425,9 @@ checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-lite" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef40d21ae2c515b51041df9ed313ed21e572df340ea58a922a0aefe7e8891a1" +checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532" dependencies = [ "fastrand", "futures-core", @@ -2437,7 +2444,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -2470,6 +2477,19 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bd114ceda131d3b1d665eba35788690ad37f5916457286b32ab6fd3c438dd" +dependencies = [ + "cfg-if", + "libc", + "log", + "rustversion", + "windows", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2501,9 +2521,9 @@ 
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "globset" @@ -2514,8 +2534,8 @@ dependencies = [ "aho-corasick", "bstr", "log", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", ] [[package]] @@ -2524,7 +2544,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.7.0", "ignore", "walkdir", ] @@ -2614,14 +2634,19 @@ name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "hashlink" -version = "0.9.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.2", ] [[package]] @@ -2835,7 +2860,7 @@ dependencies = [ "http 1.2.0", "hyper 1.5.2", "hyper-util", - "rustls 0.23.20", + "rustls 0.23.21", "rustls-pki-types", "tokio", "tokio-rustls 0.26.1", @@ -2873,7 +2898,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.52.0", ] [[package]] @@ -3080,12 +3105,15 @@ version = "0.4.0" dependencies = [ "arrow-array", "arrow-schema", + "datafusion", "futures", "iceberg", "iceberg-catalog-rest", + "iceberg-datafusion", "iceberg_test_utils", "parquet", "tokio", + "uuid", ] [[package]] @@ -3211,7 +3239,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -3251,7 +3279,7 @@ dependencies = [ "globset", "log", "memchr", - "regex-automata", + "regex-automata 0.4.9", "same-file", "walkdir", "winapi-util", @@ -3343,9 +3371,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.76" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ "once_cell", "wasm-bindgen", @@ -3514,9 +3542,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "litemap" @@ -3536,13 +3564,26 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" +version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" dependencies = [ "value-bag", ] +[[package]] +name = "loom" +version = "0.7.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lz4_flex" version = "0.11.3" @@ -3563,6 +3604,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "md-5" version = "0.10.6" @@ -3607,17 +3657,11 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" +checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" dependencies = [ "adler2", ] @@ -3659,25 +3703,23 @@ dependencies = [ [[package]] name = "moka" -version = "0.12.8" +version = "0.12.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cf62eb4dd975d2dde76432fb1075c49e3ee2331cf36f1f8fd4b66550d32b6f" +checksum = "a9321642ca94a4282428e6ea4af8cc2ca4eac48ac7a6a4ea8f33f76d0ce70926" dependencies = [ "async-lock", - "async-trait", "crossbeam-channel", "crossbeam-epoch", "crossbeam-utils", - "event-listener 5.3.1", + "event-listener 5.4.0", "futures-util", - "once_cell", + "loom", "parking_lot", - "quanta", + "portable-atomic", "rustc_version", "smallvec", "tagptr", "thiserror 1.0.69", - "triomphe", "uuid", ] @@ -3721,7 +3763,7 @@ checksum = "1bb5c1d8184f13f7d0ccbeeca0def2f9a181bce2624302793005f5ca8aa62e5e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -3742,7 +3784,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.7.0", "cfg-if", "cfg_aliases", "libc", @@ -3750,13 +3792,13 @@ dependencies = [ ] [[package]] -name = "nom" -version = "7.1.3" +name = "nu-ansi-term" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ - "memchr", - "minimal-lexical", + "overload", + "winapi", ] [[package]] @@ -3875,7 +3917,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -3916,9 +3958,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "opendal" -version = "0.51.0" +version = "0.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8cd8697b917793c15a7b4a8afcba44e35e2abbc55c363064851776f7c81136" +checksum = "8c9dcfa7a3615e3c60eb662ed6b46b6f244cf2658098f593c0c0915430b3a268" dependencies = [ "anyhow", "async-trait", @@ -3985,6 +4027,12 @@ version = "0.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "parking" version = "2.2.1" @@ -4107,7 +4155,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b7cafe60d6cf8e62e1b9b2ea516a089c008945bb5a275416789e7db0bc199dc" dependencies = [ "memchr", - "thiserror 2.0.9", + "thiserror 2.0.11", "ucd-trie", ] @@ -4131,7 +4179,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -4157,18 +4205,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ "phf_generator", "phf_shared", @@ -4176,9 +4224,9 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", "rand", @@ -4186,9 +4234,9 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", ] @@ -4218,29 +4266,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -4324,6 +4372,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "325a6d2ac5dee293c3b2612d4993b98aec1dff096b0a2dae70ed7d95784a05da" +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "powerfmt" version = "0.2.0" @@ -4360,9 +4414,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] @@ -4413,7 +4467,7 @@ checksum = "ca414edb151b4c8d125c12566ab0d74dc9cdba36fb80eb7b848c15f495fd32d1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -4422,21 +4476,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" -[[package]] -name = "quanta" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773ce68d0bb9bc7ef20be3536ffe94e223e1f365bd374108b2659fac0c65cfe6" -dependencies = [ - "crossbeam-utils", - "libc", - "once_cell", - "raw-cpuid", - "wasi", - "web-sys", - "winapi", -] - [[package]] name = "quick-xml" version = "0.35.0" @@ -4468,9 +4507,9 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.20", + "rustls 0.23.21", "socket2", - "thiserror 2.0.9", + "thiserror 2.0.11", "tokio", "tracing", ] @@ -4486,10 +4525,10 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.20", + "rustls 0.23.21", "rustls-pki-types", "slab", - "thiserror 2.0.9", + "thiserror 2.0.11", "tinyvec", "tracing", "web-time", @@ -4565,15 +4604,6 @@ dependencies = [ "serde", ] -[[package]] -name = "raw-cpuid" -version = "11.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab240315c661615f2ee9f0f2cd32d5a7343a84d5ebcccb99d46e6637565e7b0" -dependencies = [ - "bitflags 2.6.0", -] - [[package]] name = "recursive" version = "0.1.1" @@ -4591,7 +4621,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -4600,7 +4630,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.7.0", ] [[package]] @@ -4620,7 +4650,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -4631,8 +4661,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -4643,7 +4682,7 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -4652,6 +4691,12 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -4705,9 +4750,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.11" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe060fe50f524be480214aba758c71f99f90ee8c83c5a36b5e9e1d568eb4eb3" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", @@ -4727,7 +4772,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.20", + "rustls 0.23.21", "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", @@ -4818,7 +4863,7 @@ checksum = "beb382a4d9f53bd5c0be86b10d8179c3f8a14c30bf774ff77096ed6581e35981" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -4901,11 +4946,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.42" +version = "0.38.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" +checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.7.0", "errno", "libc", "linux-raw-sys", @@ -4926,9 +4971,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.20" +version = "0.23.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" +checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" dependencies = [ "once_cell", "ring", @@ -5037,6 +5082,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -5076,7 +5127,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.7.0", "core-foundation", "core-foundation-sys", "libc", @@ -5085,9 +5136,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1863fd3768cd83c56a7f60faa4dc0d403f1b6df0a38c3c25f44b7894e45370d5" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" dependencies = [ "core-foundation-sys", "libc", @@ -5131,14 +5182,14 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.134" +version = "1.0.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" +checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" dependencies = [ "itoa", "memchr", @@ -5154,7 +5205,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] 
[[package]] @@ -5196,7 +5247,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -5221,6 +5272,15 @@ dependencies = [ "digest", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -5260,21 +5320,21 @@ checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" [[package]] name = "simple_asn1" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" +checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 1.0.69", + "thiserror 2.0.11", "time", ] [[package]] name = "siphasher" -version = "0.3.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" @@ -5322,7 +5382,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -5367,7 +5427,7 @@ dependencies = [ "simdutf8", "sonic-number", "sonic-simd", - "thiserror 2.0.9", + "thiserror 2.0.11", ] [[package]] @@ -5398,16 +5458,6 @@ dependencies = [ "der", ] -[[package]] -name = "sqlformat" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bba3a93db0cc4f7bdece8bb09e77e2e785c20bfebf79eb8340ed80708048790" -dependencies = [ - "nom", - "unicode_categories", -] - [[package]] name = "sqllogictest" version = "0.4.0" @@ -5433,14 +5483,14 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] name = "sqlx" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93334716a037193fac19df402f8571269c84a00852f6a7066b5d2616dcd64d3e" +checksum = "4410e73b3c0d8442c5f99b425d7a435b5ee0ae4167b3196771dd3f7a01be745f" dependencies = [ "sqlx-core", "sqlx-macros", @@ -5451,39 +5501,33 @@ dependencies = [ [[package]] name = "sqlx-core" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4d8060b456358185f7d50c55d9b5066ad956956fddec42ee2e8567134a8936e" +checksum = "6a007b6936676aa9ab40207cde35daab0a04b823be8ae004368c0793b96a61e0" dependencies = [ - "atoi", - "byteorder", "bytes", "crc", "crossbeam-queue", "either", - "event-listener 5.3.1", - "futures-channel", + "event-listener 5.4.0", "futures-core", "futures-intrusive", "futures-io", "futures-util", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "hashlink", - "hex", "indexmap 2.7.0", "log", "memchr", "once_cell", - "paste", "percent-encoding", - "rustls 0.23.20", + "rustls 0.23.21", "rustls-pemfile 2.2.0", "serde", "serde_json", "sha2", "smallvec", - "sqlformat", - "thiserror 1.0.69", + "thiserror 2.0.11", "tokio", "tokio-stream", "tracing", @@ -5493,22 +5537,22 @@ dependencies = [ [[package]] name = "sqlx-macros" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cac0692bcc9de3b073e8d747391827297e075c7710ff6276d9f7a1f3d58c6657" +checksum = "3112e2ad78643fef903618d78cf0aec1cb3134b019730edb039b69eaf531f310" dependencies = [ "proc-macro2", "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] name = "sqlx-macros-core" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1804e8a7c7865599c9c79be146dc8a9fd8cc86935fa641d3ea58e5f0688abaa5" +checksum = "4e9f90acc5ab146a99bf5061a7eb4976b573f560bc898ef3bf8435448dd5e7ad" dependencies = [ "dotenvy", "either", @@ -5522,7 +5566,7 @@ dependencies = [ "sha2", "sqlx-core", "sqlx-sqlite", - "syn 2.0.92", + "syn 2.0.96", "tempfile", "tokio", "url", @@ -5530,13 +5574,13 @@ dependencies = [ [[package]] name = "sqlx-mysql" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64bb4714269afa44aef2755150a0fc19d756fb580a67db8885608cf02f47d06a" +checksum = "4560278f0e00ce64938540546f59f590d60beee33fffbd3b9cd47851e5fff233" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.6.0", + "bitflags 2.7.0", "byteorder", "bytes", "crc", @@ -5564,27 +5608,26 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 1.0.69", + "thiserror 2.0.11", "tracing", "whoami", ] [[package]] name = "sqlx-postgres" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa91a732d854c5d7726349bb4bb879bb9478993ceb764247660aee25f67c2f8" +checksum = "c5b98a57f363ed6764d5b3a12bfedf62f07aa16e1856a7ddc2a0bb190a959613" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.6.0", + "bitflags 2.7.0", "byteorder", "crc", "dotenvy", "etcetera", "futures-channel", "futures-core", - "futures-io", "futures-util", "hex", "hkdf", @@ -5602,16 +5645,16 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 1.0.69", + "thiserror 2.0.11", "tracing", "whoami", ] [[package]] name = "sqlx-sqlite" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5b2cf34a45953bfd3daaf3db0f7a7878ab9b7a6b91b422d24a7a9e4c857b680" +checksum = "f85ca71d3a5b24e64e1d08dd8fe36c6c95c339a896cc33068148906784620540" dependencies = [ "atoi", "flume", @@ -5688,7 +5731,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -5710,9 +5753,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.92" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ae51629bf965c5c098cc9e87908a3df5301051a9e087d6f9bef5c9771ed126" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -5736,7 +5779,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -5753,12 +5796,13 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" -version = "3.14.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", @@ -5797,11 +5841,11 @@ dependencies = [ [[package]] name = "thiserror" -version = 
"2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl 2.0.9", + "thiserror-impl 2.0.11", ] [[package]] @@ -5812,18 +5856,28 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] name = "thiserror-impl" -version = "2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", ] [[package]] @@ -5904,9 +5958,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.42.0" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", @@ -5922,13 +5976,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -5947,7 +6001,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.20", + "rustls 0.23.21", "tokio", ] @@ -6039,7 +6093,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -6049,19 +6103,43 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", + "valuable", ] [[package]] -name = "trim-in-place" -version = "0.1.7" +name = "tracing-log" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343e926fc669bc8cde4fa3129ab681c63671bae288b1f1081ceee6d9d37904fc" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] [[package]] -name = "triomphe" -version = "0.1.11" +name = "tracing-subscriber" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "trim-in-place" +version = "0.1.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" +checksum = "343e926fc669bc8cde4fa3129ab681c63671bae288b1f1081ceee6d9d37904fc" [[package]] name = "try-lock" @@ -6105,7 +6183,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -6116,7 +6194,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -6220,12 +6298,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" - [[package]] name = "untrusted" version = "0.9.0" @@ -6269,14 +6341,20 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" dependencies = [ "getrandom", "serde", ] +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "value-bag" version = "1.10.0" @@ -6345,7 +6423,7 @@ dependencies = [ "rustc-hash", "scopeguard", "sonic-rs", - "thiserror 2.0.9", + "thiserror 2.0.11", "tokio", "tracing", "volo", @@ -6390,34 +6468,35 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.49" +version = "0.4.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" dependencies = [ "cfg-if", "js-sys", @@ -6428,9 +6507,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6438,22 +6517,25 
@@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.99" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-streams" @@ -6470,9 +6552,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.76" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" dependencies = [ "js-sys", "wasm-bindgen", @@ -6538,6 +6620,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" +dependencies = [ + "windows-core 0.58.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-core" version = "0.52.0" @@ -6547,6 +6639,41 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba6d44ec8c2591c134257ce647b7ea6b20335bf6379a27dac5f1641fcf59f99" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + +[[package]] +name = "windows-interface" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "windows-registry" version = "0.2.0" @@ -6727,9 +6854,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.6.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" dependencies = [ "memchr", ] @@ -6796,7 +6923,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", "synstructure", ] @@ -6818,7 +6945,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] @@ -6838,7 +6965,7 @@ checksum 
= "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", "synstructure", ] @@ -6867,7 +6994,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.92", + "syn 2.0.96", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 5b1dca422..99cfad562 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,27 +41,28 @@ rust-version = "1.77.1" anyhow = "1.0.72" apache-avro = "0.17" array-init = "2" -arrow-arith = { version = "53" } -arrow-array = { version = "53" } -arrow-cast = { version = "53" } -arrow-ord = { version = "53" } -arrow-schema = { version = "53" } -arrow-select = { version = "53" } -arrow-string = { version = "53" } +arrow-arith = { version = "53.3.0" } +arrow-array = { version = "53.3.0" } +arrow-cast = { version = "53.3.0" } +arrow-ord = { version = "53.3.0" } +arrow-schema = { version = "53.4.0" } +arrow-select = { version = "53.3.0" } +arrow-string = { version = "53.3.0" } async-stream = "0.3.5" -async-trait = "0.1" +async-trait = "0.1.85" async-std = "1.12" -aws-config = "1.1.8" -aws-sdk-glue = "1.21" +aws-config = "1" +aws-sdk-glue = "1.39" bimap = "0.6" bitvec = "1.0.1" -bytes = "1.5" -chrono = "0.4.34" +bytes = "1.6" +chrono = "0.4.38" ctor = "0.2.8" +datafusion = "44" derive_builder = "0.20" either = "1" env_logger = "0.11.0" -fnv = "1" +fnv = "1.0.7" futures = "0.3" iceberg = { version = "0.4.0", path = "./crates/iceberg" } iceberg-catalog-rest = { version = "0.4.0", path = "./crates/catalog/rest" } @@ -69,34 +70,34 @@ iceberg-catalog-hms = { version = "0.4.0", path = "./crates/catalog/hms" } iceberg-catalog-memory = { version = "0.4.0", path = "./crates/catalog/memory" } iceberg-datafusion = { version = "0.4.0", path = "./crates/integrations/datafusion" } itertools = "0.13" -log = "0.4" +log = "0.4.22" mockito = "1" murmur3 = "0.5.2" num-bigint = "0.4.6" -once_cell = "1" -opendal = "0.51.0" +once_cell = "1.19" +opendal = "0.51.1" ordered-float = "4" -parquet = "53.1" -paste = "1" +parquet = "53.3.0" +paste = "1.0.15" pilota = "0.11.2" pretty_assertions = "1.4" port_scanner = "0.1.5" -rand = "0.8" +rand = "0.8.5" regex = "1.10.5" -reqwest = { version = "0.12", default-features = false, features = ["json"] } +reqwest = { version = "0.12.2", default-features = false, features = ["json"] } rust_decimal = "1.31" -serde = { version = "1", features = ["rc"] } -serde_bytes = "0.11.8" -serde_derive = "1" -serde_json = "1" +serde = { version = "1.0.204", features = ["rc"] } +serde_bytes = "0.11.15" +serde_derive = "1.0.204" +serde_json = "1.0.120" serde_repr = "0.1.16" serde_with = "3.4" -tempfile = "3.8" -tokio = { version = "1", default-features = false } +tempfile = "3.15" +tokio = { version = "1.36", default-features = false } typed-builder = "0.20" -url = "2" +url = "2.2.2" urlencoding = "2" -uuid = { version = "1.6.1", features = ["v7"] } +uuid = { version = "1.10.0", features = ["v7"] } volo-thrift = "0.10" hive_metastore = "0.1" tera = "1" diff --git a/Makefile b/Makefile index 4ecc9bd88..fc8a52e5f 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ cargo-machete: install-cargo-machete cargo machete install-taplo-cli: - cargo install taplo-cli@0.9.0 + cargo install taplo-cli@0.9.3 fix-toml: install-taplo-cli taplo fmt diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 981bd2d4f..69c96a42c 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -23,6 +23,7 
@@ build-backend = "maturin" name = "pyiceberg_core" version = "0.4.0" readme = "project-description.md" +requires-python = "~=3.9" classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", diff --git a/crates/catalog/rest/Cargo.toml b/crates/catalog/rest/Cargo.toml index add57183b..691545493 100644 --- a/crates/catalog/rest/Cargo.toml +++ b/crates/catalog/rest/Cargo.toml @@ -35,7 +35,7 @@ chrono = { workspace = true } http = "1.1.0" iceberg = { workspace = true } itertools = { workspace = true } -log = "0.4.20" +log = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } serde_derive = { workspace = true } diff --git a/crates/catalog/s3tables/Cargo.toml b/crates/catalog/s3tables/Cargo.toml index 772b328f3..46e7b5375 100644 --- a/crates/catalog/s3tables/Cargo.toml +++ b/crates/catalog/s3tables/Cargo.toml @@ -20,7 +20,7 @@ name = "iceberg-catalog-s3tables" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.81.0" categories = ["database"] description = "Apache Iceberg Rust S3Tables Catalog" @@ -32,7 +32,7 @@ keywords = ["iceberg", "sql", "catalog"] anyhow = { workspace = true } async-trait = { workspace = true } aws-config = { workspace = true } -aws-sdk-s3tables = "1.2.0" +aws-sdk-s3tables = "1.4.0" iceberg = { workspace = true } serde_json = { workspace = true } uuid = { workspace = true, features = ["v4"] } diff --git a/crates/catalog/sql/Cargo.toml b/crates/catalog/sql/Cargo.toml index a51671650..71cf37c14 100644 --- a/crates/catalog/sql/Cargo.toml +++ b/crates/catalog/sql/Cargo.toml @@ -40,7 +40,7 @@ uuid = { workspace = true, features = ["v4"] } iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } itertools = { workspace = true } regex = "1.10.5" -sqlx = { version = "0.8.0", features = [ +sqlx = { version = "0.8.1", features = [ "tls-rustls", "runtime-tokio", "any", diff --git a/crates/examples/Cargo.toml b/crates/examples/Cargo.toml index 2fb3060c1..2f1dbf858 100644 --- a/crates/examples/Cargo.toml +++ b/crates/examples/Cargo.toml @@ -27,7 +27,7 @@ rust-version = { workspace = true } [dependencies] iceberg = { workspace = true } iceberg-catalog-rest = { workspace = true } -tokio = { version = "1", features = ["full"] } +tokio = { workspace = true, features = ["full"] } [[example]] name = "rest-catalog-namespace" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 7f323722f..626ca15ef 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -61,7 +61,7 @@ derive_builder = { workspace = true } fnv = { workspace = true } futures = { workspace = true } itertools = { workspace = true } -moka = { version = "0.12.8", features = ["future"] } +moka = { version = "0.12.10", features = ["future"] } murmur3 = { workspace = true } num-bigint = { workspace = true } once_cell = { workspace = true } diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index b4e15821f..6fcd59297 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -48,7 +48,7 @@ use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator use crate::expr::{BoundPredicate, BoundReference}; use crate::io::{FileIO, FileMetadata, FileRead}; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; -use crate::spec::{Datum, PrimitiveType, Schema}; +use crate::spec::{Datum, NestedField, PrimitiveType, Schema, Type}; use 
 use crate::utils::available_parallelism;
 use crate::{Error, ErrorKind};
 
@@ -273,6 +273,28 @@ impl ArrowReader {
         Ok((iceberg_field_ids, field_id_map))
     }
 
+    /// Insert the leaf field id into the field_ids used for projection.
+    /// For nested types, it recursively inserts the leaf field ids.
+    fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec<i32>) {
+        match field.field_type.as_ref() {
+            Type::Primitive(_) => {
+                field_ids.push(field.id);
+            }
+            Type::Struct(struct_type) => {
+                for nested_field in struct_type.fields() {
+                    Self::include_leaf_field_id(nested_field, field_ids);
+                }
+            }
+            Type::List(list_type) => {
+                Self::include_leaf_field_id(&list_type.element_field, field_ids);
+            }
+            Type::Map(map_type) => {
+                Self::include_leaf_field_id(&map_type.key_field, field_ids);
+                Self::include_leaf_field_id(&map_type.value_field, field_ids);
+            }
+        }
+    }
+
     fn get_arrow_projection_mask(
         field_ids: &[i32],
         iceberg_schema_of_task: &Schema,
@@ -297,11 +319,21 @@ impl ArrowReader {
                     scale: requested_scale,
                 }),
             ) if requested_precision >= file_precision && file_scale == requested_scale => true,
+            // Uuid will be stored as Fixed(16) in the Parquet file, so the read-back type will be Fixed(16).
+            (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true,
             _ => false,
         }
     }
 
-        if field_ids.is_empty() {
+        let mut leaf_field_ids = vec![];
+        for field_id in field_ids {
+            let field = iceberg_schema_of_task.field_by_id(*field_id);
+            if let Some(field) = field {
+                Self::include_leaf_field_id(field, &mut leaf_field_ids);
+            }
+        }
+
+        if leaf_field_ids.is_empty() {
             Ok(ProjectionMask::all())
         } else {
             // Build the map between field id and column index in Parquet schema.
@@ -318,7 +350,7 @@ impl ArrowReader {
                         .and_then(|field_id| i32::from_str(field_id).ok())
                         .map_or(false, |field_id| {
                             projected_fields.insert((*f).clone(), field_id);
-                            field_ids.contains(&field_id)
+                            leaf_field_ids.contains(&field_id)
                         })
                 }),
                 arrow_schema.metadata().clone(),
@@ -351,19 +383,26 @@ impl ArrowReader {
                 true
             });
 
-            if column_map.len() != field_ids.len() {
+            if column_map.len() != leaf_field_ids.len() {
+                let missing_fields = leaf_field_ids
+                    .iter()
+                    .filter(|field_id| !column_map.contains_key(field_id))
+                    .collect::<Vec<_>>();
                 return Err(Error::new(
                     ErrorKind::DataInvalid,
                     format!(
                         "Parquet schema {} and Iceberg schema {} do not match.",
                         iceberg_schema, iceberg_schema_of_task
                     ),
-                ));
+                )
+                .with_context("column_map", format!("{:?}", column_map))
+                .with_context("field_ids", format!("{:?}", leaf_field_ids))
+                .with_context("missing_fields", format!("{:?}", missing_fields)));
             }
 
             let mut indices = vec![];
-            for field_id in field_ids {
-                if let Some(col_idx) = column_map.get(field_id) {
+            for field_id in leaf_field_ids {
+                if let Some(col_idx) = column_map.get(&field_id) {
                     indices.push(*col_idx);
                 } else {
                     return Err(Error::new(
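The `include_leaf_field_id` helper above expands each requested top-level field id into the ids of all primitive leaves beneath it, so that nested columns can be matched against Parquet leaf columns. Below is a minimal, self-contained sketch of that recursion, using a simplified stand-in for the Iceberg type tree; the `Ty`, `Field`, and `leaf_ids` names here are illustrative, not the crate's API:

```rust
// Simplified model of a nested schema. The real code walks
// iceberg::spec::{NestedField, Type} instead of this toy enum.
enum Ty {
    Primitive,
    Struct(Vec<Field>),
    List(Box<Field>),
}

struct Field {
    id: i32,
    ty: Ty,
}

// Depth-first collection of primitive leaf ids, mirroring the shape of
// ArrowReader::include_leaf_field_id above.
fn leaf_ids(field: &Field, out: &mut Vec<i32>) {
    match &field.ty {
        Ty::Primitive => out.push(field.id),
        Ty::Struct(children) => children.iter().for_each(|c| leaf_ids(c, out)),
        Ty::List(element) => leaf_ids(element, out),
    }
}

fn main() {
    // A struct column (id 1) with two primitive children (ids 2 and 3):
    // projecting id 1 must select the Parquet leaves for ids 2 and 3.
    let point = Field {
        id: 1,
        ty: Ty::Struct(vec![
            Field { id: 2, ty: Ty::Primitive },
            Field { id: 3, ty: Ty::Primitive },
        ]),
    };
    let mut ids = vec![];
    leaf_ids(&point, &mut ids);
    assert_eq!(ids, vec![2, 3]);
}
```

This is also why the error path above reports `leaf_field_ids` and `missing_fields` rather than the caller's `field_ids`: any mismatch against the Parquet schema is detected at leaf granularity.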
{"{:?}", missing_fields})); } let mut indices = vec![]; - for field_id in field_ids { - if let Some(col_idx) = column_map.get(field_id) { + for field_id in leaf_field_ids { + if let Some(col_idx) = column_map.get(&field_id) { indices.push(*col_idx); } else { return Err(Error::new( diff --git a/crates/iceberg/src/arrow/record_batch_projector.rs b/crates/iceberg/src/arrow/record_batch_projector.rs index c311da1f1..9cd745691 100644 --- a/crates/iceberg/src/arrow/record_batch_projector.rs +++ b/crates/iceberg/src/arrow/record_batch_projector.rs @@ -190,7 +190,7 @@ mod test { RecordBatchProjector::new(schema.clone(), &[1, 3], field_id_fetch_func, |_| true) .unwrap(); - assert!(projector.field_indices.len() == 2); + assert_eq!(projector.field_indices.len(), 2); assert_eq!(projector.field_indices[0], vec![0]); assert_eq!(projector.field_indices[1], vec![0, 1]); diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index b590c8bc8..41afd8ea4 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -43,7 +43,9 @@ use crate::spec::{ use crate::{Error, ErrorKind}; /// When iceberg map type convert to Arrow map type, the default map field name is "key_value". -pub(crate) const DEFAULT_MAP_FIELD_NAME: &str = "key_value"; +pub const DEFAULT_MAP_FIELD_NAME: &str = "key_value"; +/// UTC time zone for Arrow timestamp type. +pub const UTC_TIME_ZONE: &str = "+00:00"; /// A post order arrow schema visitor. /// @@ -120,8 +122,10 @@ fn visit_type(r#type: &DataType, visitor: &mut V) -> Resu DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View | DataType::Binary | DataType::LargeBinary + | DataType::BinaryView | DataType::FixedSizeBinary(_) ) => { @@ -403,7 +407,9 @@ impl ArrowSchemaVisitor for ArrowSchemaConverter { { Ok(Type::Primitive(PrimitiveType::TimestamptzNs)) } - DataType::Binary | DataType::LargeBinary => Ok(Type::Primitive(PrimitiveType::Binary)), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => { + Ok(Type::Primitive(PrimitiveType::Binary)) + } DataType::FixedSizeBinary(width) => { Ok(Type::Primitive(PrimitiveType::Fixed(*width as u64))) } @@ -594,14 +600,14 @@ impl SchemaVisitor for ToArrowSchemaConverter { )), crate::spec::PrimitiveType::Timestamptz => Ok(ArrowSchemaOrFieldOrType::Type( // Timestampz always stored as UTC - DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Microsecond, Some(UTC_TIME_ZONE.into())), )), crate::spec::PrimitiveType::TimestampNs => Ok(ArrowSchemaOrFieldOrType::Type( DataType::Timestamp(TimeUnit::Nanosecond, None), )), crate::spec::PrimitiveType::TimestamptzNs => Ok(ArrowSchemaOrFieldOrType::Type( // Store timestamptz_ns as UTC - DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Nanosecond, Some(UTC_TIME_ZONE.into())), )), crate::spec::PrimitiveType::String => { Ok(ArrowSchemaOrFieldOrType::Type(DataType::Utf8)) diff --git a/crates/iceberg/src/expr/mod.rs b/crates/iceberg/src/expr/mod.rs index 5771aac5e..42f409b42 100644 --- a/crates/iceberg/src/expr/mod.rs +++ b/crates/iceberg/src/expr/mod.rs @@ -124,13 +124,25 @@ impl PredicateOperator { /// /// ```rust /// use iceberg::expr::PredicateOperator; - /// assert!(PredicateOperator::IsNull.negate() == PredicateOperator::NotNull); - /// assert!(PredicateOperator::IsNan.negate() == PredicateOperator::NotNan); - /// assert!(PredicateOperator::LessThan.negate() == PredicateOperator::GreaterThanOrEq); - /// 
-    /// assert!(PredicateOperator::GreaterThan.negate() == PredicateOperator::LessThanOrEq);
-    /// assert!(PredicateOperator::Eq.negate() == PredicateOperator::NotEq);
-    /// assert!(PredicateOperator::In.negate() == PredicateOperator::NotIn);
-    /// assert!(PredicateOperator::StartsWith.negate() == PredicateOperator::NotStartsWith);
+    /// assert_eq!(
+    ///     PredicateOperator::IsNull.negate(),
+    ///     PredicateOperator::NotNull
+    /// );
+    /// assert_eq!(PredicateOperator::IsNan.negate(), PredicateOperator::NotNan);
+    /// assert_eq!(
+    ///     PredicateOperator::LessThan.negate(),
+    ///     PredicateOperator::GreaterThanOrEq
+    /// );
+    /// assert_eq!(
+    ///     PredicateOperator::GreaterThan.negate(),
+    ///     PredicateOperator::LessThanOrEq
+    /// );
+    /// assert_eq!(PredicateOperator::Eq.negate(), PredicateOperator::NotEq);
+    /// assert_eq!(PredicateOperator::In.negate(), PredicateOperator::NotIn);
+    /// assert_eq!(
+    ///     PredicateOperator::StartsWith.negate(),
+    ///     PredicateOperator::NotStartsWith
+    /// );
     /// ```
     pub fn negate(self) -> PredicateOperator {
         match self {
diff --git a/crates/iceberg/src/metadata_scan.rs b/crates/iceberg/src/inspect/manifests.rs
similarity index 61%
rename from crates/iceberg/src/metadata_scan.rs
rename to crates/iceberg/src/inspect/manifests.rs
index c19231998..1e2783448 100644
--- a/crates/iceberg/src/metadata_scan.rs
+++ b/crates/iceberg/src/inspect/manifests.rs
@@ -15,128 +15,34 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Metadata table api.
-
 use std::collections::HashMap;
 use std::sync::Arc;
 
 use arrow_array::builder::{
-    BooleanBuilder, ListBuilder, MapBuilder, PrimitiveBuilder, StringBuilder, StructBuilder,
+    BooleanBuilder, ListBuilder, PrimitiveBuilder, StringBuilder, StructBuilder,
 };
-use arrow_array::types::{Int32Type, Int64Type, TimestampMillisecondType};
+use arrow_array::types::{Int32Type, Int64Type};
 use arrow_array::RecordBatch;
-use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit};
+use arrow_schema::{DataType, Field, Fields};
+use futures::{stream, StreamExt};
 
 use crate::arrow::schema_to_arrow_schema;
+use crate::scan::ArrowRecordBatchStream;
 use crate::spec::{ListType, NestedField, PrimitiveType, StructType, Type};
 use crate::table::Table;
 use crate::Result;
 
-/// Metadata table is used to inspect a table's history, snapshots, and other metadata as a table.
-///
-/// References:
-/// -
-/// -
-/// -
-#[derive(Debug)]
-pub struct MetadataTable(Table);
-
-impl MetadataTable {
-    /// Creates a new metadata scan.
-    pub(super) fn new(table: Table) -> Self {
-        Self(table)
-    }
-
-    /// Get the snapshots table.
-    pub fn snapshots(&self) -> SnapshotsTable {
-        SnapshotsTable { table: &self.0 }
-    }
-
-    /// Get the manifests table.
-    pub fn manifests(&self) -> ManifestsTable {
-        ManifestsTable { table: &self.0 }
-    }
-}
-
-/// Snapshots table.
-pub struct SnapshotsTable<'a> {
-    table: &'a Table,
-}
-
-impl<'a> SnapshotsTable<'a> {
-    /// Returns the schema of the snapshots table.
-    pub fn schema(&self) -> Schema {
-        Schema::new(vec![
-            Field::new(
-                "committed_at",
-                DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
-                false,
-            ),
-            Field::new("snapshot_id", DataType::Int64, false),
-            Field::new("parent_id", DataType::Int64, true),
-            Field::new("operation", DataType::Utf8, false),
-            Field::new("manifest_list", DataType::Utf8, false),
-            Field::new(
-                "summary",
-                DataType::Map(
-                    Arc::new(Field::new(
-                        "entries",
-                        DataType::Struct(
-                            vec![
-                                Field::new("keys", DataType::Utf8, false),
-                                Field::new("values", DataType::Utf8, true),
-                            ]
-                            .into(),
-                        ),
-                        false,
-                    )),
-                    false,
-                ),
-                false,
-            ),
-        ])
-    }
-
-    /// Scans the snapshots table.
-    pub fn scan(&self) -> Result<RecordBatch> {
-        let mut committed_at =
-            PrimitiveBuilder::<TimestampMillisecondType>::new().with_timezone("+00:00");
-        let mut snapshot_id = PrimitiveBuilder::<Int64Type>::new();
-        let mut parent_id = PrimitiveBuilder::<Int64Type>::new();
-        let mut operation = StringBuilder::new();
-        let mut manifest_list = StringBuilder::new();
-        let mut summary = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
-
-        for snapshot in self.table.metadata().snapshots() {
-            committed_at.append_value(snapshot.timestamp_ms());
-            snapshot_id.append_value(snapshot.snapshot_id());
-            parent_id.append_option(snapshot.parent_snapshot_id());
-            manifest_list.append_value(snapshot.manifest_list());
-            operation.append_value(snapshot.summary().operation.as_str());
-            for (key, value) in &snapshot.summary().additional_properties {
-                summary.keys().append_value(key);
-                summary.values().append_value(value);
-            }
-            summary.append(true)?;
-        }
-
-        Ok(RecordBatch::try_new(Arc::new(self.schema()), vec![
-            Arc::new(committed_at.finish()),
-            Arc::new(snapshot_id.finish()),
-            Arc::new(parent_id.finish()),
-            Arc::new(operation.finish()),
-            Arc::new(manifest_list.finish()),
-            Arc::new(summary.finish()),
-        ])?)
-    }
-}
-
 /// Manifests table.
 pub struct ManifestsTable<'a> {
     table: &'a Table,
 }
 
 impl<'a> ManifestsTable<'a> {
+    /// Create a new Manifests table instance.
+    pub fn new(table: &'a Table) -> Self {
+        Self { table }
+    }
+
     /// Returns the iceberg schema of the manifests table.
     pub fn schema(&self) -> crate::spec::Schema {
         let fields = vec![
@@ -238,7 +144,7 @@ impl<'a> ManifestsTable<'a> {
     }
 
     /// Scans the manifests table.
-    pub async fn scan(&self) -> Result<RecordBatch> {
+    pub async fn scan(&self) -> Result<ArrowRecordBatchStream> {
         let schema = schema_to_arrow_schema(&self.schema())?;
         let partition_summary_fields = if let DataType::List(list_type) =
             schema.field_with_name("partition_summaries")?.data_type()
@@ -319,7 +225,7 @@ impl<'a> ManifestsTable<'a> {
             }
         }
 
-        Ok(RecordBatch::try_new(Arc::new(schema), vec![
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![
             Arc::new(content.finish()),
             Arc::new(path.finish()),
             Arc::new(length.finish()),
@@ -332,150 +238,26 @@ impl<'a> ManifestsTable<'a> {
             Arc::new(existing_delete_files_count.finish()),
             Arc::new(deleted_delete_files_count.finish()),
             Arc::new(partition_summaries.finish()),
-        ])?)
+        ])?;
+        Ok(stream::iter(vec![Ok(batch)]).boxed())
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use expect_test::{expect, Expect};
-    use itertools::Itertools;
+    use expect_test::expect;
 
-    use super::*;
+    use crate::inspect::metadata_table::tests::check_record_batches;
     use crate::scan::tests::TableTestFixture;
 
-    /// Snapshot testing to check the resulting record batch.
-    ///
-    /// - `expected_schema/data`: put `expect![[""]]` as a placeholder,
-    ///   and then run test with `UPDATE_EXPECT=1 cargo test` to automatically update the result,
-    ///   or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)).
-    ///   Check the doc of [`expect_test`] for more details.
-    /// - `ignore_check_columns`: Some columns are not stable, so we can skip them.
-    /// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column.
-    fn check_record_batch(
-        record_batch: RecordBatch,
-        expected_schema: Expect,
-        expected_data: Expect,
-        ignore_check_columns: &[&str],
-        sort_column: Option<&str>,
-    ) {
-        let mut columns = record_batch.columns().to_vec();
-        if let Some(sort_column) = sort_column {
-            let column = record_batch.column_by_name(sort_column).unwrap();
-            let indices = arrow_ord::sort::sort_to_indices(column, None, None).unwrap();
-            columns = columns
-                .iter()
-                .map(|column| arrow_select::take::take(column.as_ref(), &indices, None).unwrap())
-                .collect_vec();
-        }
-
-        expected_schema.assert_eq(&format!(
-            "{}",
-            record_batch.schema().fields().iter().format(",\n")
-        ));
-        expected_data.assert_eq(&format!(
-            "{}",
-            record_batch
-                .schema()
-                .fields()
-                .iter()
-                .zip_eq(columns)
-                .map(|(field, column)| {
-                    if ignore_check_columns.contains(&field.name().as_str()) {
-                        format!("{}: (skipped)", field.name())
-                    } else {
-                        format!("{}: {:?}", field.name(), column)
-                    }
-                })
-                .format(",\n")
-        ));
-    }
-
-    #[test]
-    fn test_snapshots_table() {
-        let table = TableTestFixture::new().table;
-        let record_batch = table.metadata_table().snapshots().scan().unwrap();
-        check_record_batch(
-            record_batch,
-            expect![[r#"
-                Field { name: "committed_at", data_type: Timestamp(Millisecond, Some("+00:00")), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
-                Field { name: "snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
-                Field { name: "parent_id", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
-                Field { name: "operation", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
-                Field { name: "manifest_list", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
-                Field { name: "summary", data_type: Map(Field { name: "entries", data_type: Struct([Field { name: "keys", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "values", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"#]],
-            expect![[r#"
-                committed_at: PrimitiveArray<Timestamp(Millisecond, Some("+00:00"))>
-                [
-                  2018-01-04T21:22:35.770+00:00,
-                  2019-04-12T20:29:15.770+00:00,
-                ],
-                snapshot_id: PrimitiveArray<Int64>
-                [
-                  3051729675574597004,
-                  3055729675574597004,
-                ],
-                parent_id: PrimitiveArray<Int64>
-                [
-                  null,
-                  3051729675574597004,
-                ],
-                operation: StringArray
-                [
-                  "append",
-                  "append",
-                ],
-                manifest_list: (skipped),
-                summary: MapArray
-                [
-                  StructArray
-                  -- validity:
-                  [
-                  ]
-                  [
-                  -- child 0: "keys" (Utf8)
-                  StringArray
-                  [
-                  ]
-                  -- child 1: "values" (Utf8)
-                  StringArray
-                  [
-                  ]
-                  ],
-                  StructArray
-                  -- validity:
-                  [
-                  ]
-                  [
-                  -- child 0: "keys" (Utf8)
-                  StringArray
-                  [
-                  ]
-                  -- child 1: "values" (Utf8)
-                  StringArray
-                  [
-                  ]
-                  ],
-                ]"#]],
-            &["manifest_list"],
-            Some("committed_at"),
-        );
-    }
-
     #[tokio::test]
     async fn test_manifests_table() {
         let mut fixture = TableTestFixture::new();
         fixture.setup_manifest_files().await;
 
-        let record_batch = fixture
-            .table
-            .metadata_table()
-            .manifests()
-            .scan()
-            .await
-            .unwrap();
+        let record_batch = fixture.table.inspect().manifests().scan().await.unwrap();
 
-        check_record_batch(
+        check_record_batches(
             record_batch,
             expect![[r#"
                 Field { name: "content", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "14"} },
@@ -561,6 +343,6 @@ mod tests {
             ]"#]],
             &["path", "length"],
             Some("path"),
-        );
+        ).await;
     }
 }
diff --git a/crates/iceberg/src/inspect/metadata_table.rs b/crates/iceberg/src/inspect/metadata_table.rs
new file mode 100644
index 000000000..75dbc7472
--- /dev/null
+++ b/crates/iceberg/src/inspect/metadata_table.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::{ManifestsTable, SnapshotsTable};
+use crate::table::Table;
+
+/// Metadata table is used to inspect a table's history, snapshots, and other metadata as a table.
+///
+/// References:
+/// -
+/// -
+/// -
+#[derive(Debug)]
+pub struct MetadataTable<'a>(&'a Table);
+
+impl<'a> MetadataTable<'a> {
+    /// Creates a new metadata scan.
+    pub fn new(table: &'a Table) -> Self {
+        Self(table)
+    }
+
+    /// Get the snapshots table.
+    pub fn snapshots(&self) -> SnapshotsTable {
+        SnapshotsTable::new(self.0)
+    }
+
+    /// Get the manifests table.
+    pub fn manifests(&self) -> ManifestsTable {
+        ManifestsTable::new(self.0)
+    }
+}
+
+#[cfg(test)]
+pub mod tests {
+    use expect_test::Expect;
+    use futures::TryStreamExt;
+    use itertools::Itertools;
+
+    use crate::scan::ArrowRecordBatchStream;
+
+    /// Snapshot testing to check the resulting record batch.
+    ///
+    /// - `expected_schema/data`: put `expect![[""]]` as a placeholder,
+    ///   and then run test with `UPDATE_EXPECT=1 cargo test` to automatically update the result,
+    ///   or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)).
+    ///   Check the doc of [`expect_test`] for more details.
+    /// - `ignore_check_columns`: Some columns are not stable, so we can skip them.
+    /// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column.
+    pub async fn check_record_batches(
+        batch_stream: ArrowRecordBatchStream,
+        expected_schema: Expect,
+        expected_data: Expect,
+        ignore_check_columns: &[&str],
+        sort_column: Option<&str>,
+    ) {
+        let record_batches = batch_stream.try_collect::<Vec<_>>().await.unwrap();
+        assert!(!record_batches.is_empty(), "Empty record batches");
+
+        // Combine record batches using the first batch's schema
+        let first_batch = record_batches.first().unwrap();
+        let record_batch =
+            arrow_select::concat::concat_batches(&first_batch.schema(), &record_batches).unwrap();
+
+        let mut columns = record_batch.columns().to_vec();
+        if let Some(sort_column) = sort_column {
+            let column = record_batch.column_by_name(sort_column).unwrap();
+            let indices = arrow_ord::sort::sort_to_indices(column, None, None).unwrap();
+            columns = columns
+                .iter()
+                .map(|column| arrow_select::take::take(column.as_ref(), &indices, None).unwrap())
+                .collect_vec();
+        }
+
+        expected_schema.assert_eq(&format!(
+            "{}",
+            record_batch.schema().fields().iter().format(",\n")
+        ));
+        expected_data.assert_eq(&format!(
+            "{}",
+            record_batch
+                .schema()
+                .fields()
+                .iter()
+                .zip_eq(columns)
+                .map(|(field, column)| {
+                    if ignore_check_columns.contains(&field.name().as_str()) {
+                        format!("{}: (skipped)", field.name())
+                    } else {
+                        format!("{}: {:?}", field.name(), column)
+                    }
+                })
+                .format(",\n")
+        ));
+    }
+}
diff --git a/crates/iceberg/src/inspect/mod.rs b/crates/iceberg/src/inspect/mod.rs
new file mode 100644
index 000000000..b64420ea1
--- /dev/null
+++ b/crates/iceberg/src/inspect/mod.rs
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metadata table APIs.
+
+mod manifests;
+mod metadata_table;
+mod snapshots;
+
+pub use manifests::ManifestsTable;
+pub use metadata_table::*;
+pub use snapshots::SnapshotsTable;
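Taken together, `MetadataTable` and the per-table structs give the following consumer-side flow. This is a hedged sketch that assumes only what the tests in this patch show (`Table::inspect()`, the async stream-returning `scan()`, and `futures::TryStreamExt` for collection); table construction and error handling are elided:

```rust
use arrow_array::RecordBatch;
use futures::TryStreamExt;
use iceberg::table::Table;
use iceberg::Result;

// Collect the snapshots metadata table into in-memory record batches.
// `table.inspect()` replaces the old `table.metadata_table()`, and
// `scan()` now yields an ArrowRecordBatchStream (a fallible stream of
// RecordBatch values) instead of a single RecordBatch.
async fn snapshot_batches(table: &Table) -> Result<Vec<RecordBatch>> {
    let stream = table.inspect().snapshots().scan().await?;
    stream.try_collect().await
}
```

The stream-based signature lets larger metadata tables (e.g. manifests) be produced incrementally, even though the current implementations emit a single batch via `stream::iter`.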
diff --git a/crates/iceberg/src/inspect/snapshots.rs b/crates/iceberg/src/inspect/snapshots.rs
new file mode 100644
index 000000000..1ee89963d
--- /dev/null
+++ b/crates/iceberg/src/inspect/snapshots.rs
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder};
+use arrow_array::types::{Int64Type, TimestampMillisecondType};
+use arrow_array::RecordBatch;
+use arrow_schema::{DataType, Field, Schema, TimeUnit};
+use futures::{stream, StreamExt};
+
+use crate::scan::ArrowRecordBatchStream;
+use crate::table::Table;
+use crate::Result;
+
+/// Snapshots table.
+pub struct SnapshotsTable<'a> {
+    table: &'a Table,
+}
+
+impl<'a> SnapshotsTable<'a> {
+    /// Create a new Snapshots table instance.
+    pub fn new(table: &'a Table) -> Self {
+        Self { table }
+    }
+
+    /// Returns the schema of the snapshots table.
+    pub fn schema(&self) -> Schema {
+        Schema::new(vec![
+            Field::new(
+                "committed_at",
+                DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
+                false,
+            ),
+            Field::new("snapshot_id", DataType::Int64, false),
+            Field::new("parent_id", DataType::Int64, true),
+            Field::new("operation", DataType::Utf8, false),
+            Field::new("manifest_list", DataType::Utf8, false),
+            Field::new(
+                "summary",
+                DataType::Map(
+                    Arc::new(Field::new(
+                        "entries",
+                        DataType::Struct(
+                            vec![
+                                Field::new("keys", DataType::Utf8, false),
+                                Field::new("values", DataType::Utf8, true),
+                            ]
+                            .into(),
+                        ),
+                        false,
+                    )),
+                    false,
+                ),
+                false,
+            ),
+        ])
+    }
+
+    /// Scans the snapshots table.
+    pub async fn scan(&self) -> Result<ArrowRecordBatchStream> {
+        let mut committed_at =
+            PrimitiveBuilder::<TimestampMillisecondType>::new().with_timezone("+00:00");
+        let mut snapshot_id = PrimitiveBuilder::<Int64Type>::new();
+        let mut parent_id = PrimitiveBuilder::<Int64Type>::new();
+        let mut operation = StringBuilder::new();
+        let mut manifest_list = StringBuilder::new();
+        let mut summary = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
+
+        for snapshot in self.table.metadata().snapshots() {
+            committed_at.append_value(snapshot.timestamp_ms());
+            snapshot_id.append_value(snapshot.snapshot_id());
+            parent_id.append_option(snapshot.parent_snapshot_id());
+            manifest_list.append_value(snapshot.manifest_list());
+            operation.append_value(snapshot.summary().operation.as_str());
+            for (key, value) in &snapshot.summary().additional_properties {
+                summary.keys().append_value(key);
+                summary.values().append_value(value);
+            }
+            summary.append(true)?;
+        }
+
+        let batch = RecordBatch::try_new(Arc::new(self.schema()), vec![
+            Arc::new(committed_at.finish()),
+            Arc::new(snapshot_id.finish()),
+            Arc::new(parent_id.finish()),
+            Arc::new(operation.finish()),
+            Arc::new(manifest_list.finish()),
+            Arc::new(summary.finish()),
+        ])?;
+
+        Ok(stream::iter(vec![Ok(batch)]).boxed())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use expect_test::expect;
+
+    use crate::inspect::metadata_table::tests::check_record_batches;
+    use crate::scan::tests::TableTestFixture;
+
+    #[tokio::test]
+    async fn test_snapshots_table() {
+        let table = TableTestFixture::new().table;
+
+        let batch_stream = table.inspect().snapshots().scan().await.unwrap();
+
+        check_record_batches(
+            batch_stream,
+            expect![[r#"
+                Field { name: "committed_at", data_type: Timestamp(Millisecond, Some("+00:00")), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
+                Field { name: "snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
+                Field { name: "parent_id", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
+                Field { name: "operation", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
+                Field { name: "manifest_list", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} },
+                Field { name: "summary", data_type: Map(Field { name: "entries", data_type: Struct([Field { name: "keys", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "values", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"#]],
+            expect![[r#"
+                committed_at: PrimitiveArray<Timestamp(Millisecond, Some("+00:00"))>
+                [
+                  2018-01-04T21:22:35.770+00:00,
+                  2019-04-12T20:29:15.770+00:00,
+                ],
+                snapshot_id: PrimitiveArray<Int64>
+                [
+                  3051729675574597004,
+                  3055729675574597004,
+                ],
+                parent_id: PrimitiveArray<Int64>
+                [
+                  null,
+                  3051729675574597004,
+                ],
+                operation: StringArray
+                [
+                  "append",
+                  "append",
+                ],
+                manifest_list: (skipped),
+                summary: MapArray
+                [
+                  StructArray
+                  -- validity:
+                  [
+                  ]
+                  [
+                  -- child 0: "keys" (Utf8)
+                  StringArray
+                  [
+                  ]
+                  -- child 1: "values" (Utf8)
+                  StringArray
+                  [
+                  ]
+                  ],
+                  StructArray
+                  -- validity:
+                  [
+                  ]
+                  [
+                  -- child 0: "keys" (Utf8)
+                  StringArray
+                  [
+                  ]
+                  -- child 1: "values" (Utf8)
+                  StringArray
+                  [
+                  ]
+                  ],
+                ]"#]],
+            &["manifest_list"],
+            Some("committed_at"),
+        ).await;
+    }
+}
diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs
index 6ea7594ba..8cd72da20 100644
--- a/crates/iceberg/src/io/object_cache.rs
+++ b/crates/iceberg/src/io/object_cache.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::mem::size_of_val;
 use std::sync::Arc;
 
 use crate::io::FileIO;
@@ -191,9 +192,8 @@ mod tests {
     use super::*;
     use crate::io::{FileIO, OutputFile};
     use crate::spec::{
-        DataContentType, DataFileBuilder, DataFileFormat, FormatVersion, Literal, Manifest,
-        ManifestContentType, ManifestEntry, ManifestListWriter, ManifestMetadata, ManifestStatus,
-        ManifestWriter, Struct, TableMetadata,
+        DataContentType, DataFileBuilder, DataFileFormat, Literal, ManifestEntry,
+        ManifestListWriter, ManifestStatus, ManifestWriterBuilder, Struct, TableMetadata,
     };
     use crate::table::Table;
     use crate::TableIdent;
@@ -263,37 +263,33 @@ mod tests {
         let current_partition_spec = self.table.metadata().default_partition_spec();
 
         // Write data files
-        let data_file_manifest = ManifestWriter::new(
+        let mut writer = ManifestWriterBuilder::new(
             self.next_manifest_file(),
-            current_snapshot.snapshot_id(),
+            Some(current_snapshot.snapshot_id()),
             vec![],
+            current_schema.clone(),
+            current_partition_spec.as_ref().clone(),
         )
-        .write(Manifest::new(
-            ManifestMetadata::builder()
-                .schema(current_schema.clone())
-                .content(ManifestContentType::Data)
-                .format_version(FormatVersion::V2)
-                .partition_spec((**current_partition_spec).clone())
-                .schema_id(current_schema.schema_id())
-                .build(),
-            vec![ManifestEntry::builder()
-                .status(ManifestStatus::Added)
-                .data_file(
-                    DataFileBuilder::default()
-                        .content(DataContentType::Data)
-                        .file_path(format!("{}/1.parquet", &self.table_location))
-                        .file_format(DataFileFormat::Parquet)
-                        .file_size_in_bytes(100)
-                        .record_count(1)
-                        .partition(Struct::from_iter([Some(Literal::long(100))]))
-                        .key_metadata(None)
-                        .build()
-                        .unwrap(),
-                )
-                .build()],
-        ))
-        .await
-        .unwrap();
+        .build_v2_data();
+        writer
+            .add_entry(
+                ManifestEntry::builder()
+                    .status(ManifestStatus::Added)
+                    .data_file(
+                        DataFileBuilder::default()
+                            .content(DataContentType::Data)
+                            .file_path(format!("{}/1.parquet", &self.table_location))
+                            .file_format(DataFileFormat::Parquet)
+                            .file_size_in_bytes(100)
+                            .record_count(1)
+                            .partition(Struct::from_iter([Some(Literal::long(100))]))
+                            .build()
+                            .unwrap(),
+                    )
+                    .build(),
+            )
+            .unwrap();
+        let data_file_manifest = writer.write_manifest_file().await.unwrap();
 
         // Write to manifest list
         let mut manifest_list_write = ManifestListWriter::v2(
diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs
index 1946f35f3..fe5a52999 100644
--- a/crates/iceberg/src/lib.rs
+++ b/crates/iceberg/src/lib.rs
@@ -73,7 +73,7 @@ mod avro;
 pub mod io;
 pub mod spec;
 
-pub mod metadata_scan;
+pub mod inspect;
 pub mod scan;
 
 pub mod expr;
diff --git a/crates/iceberg/src/puffin/compression.rs b/crates/iceberg/src/puffin/compression.rs
index 710698df8..a9a56ef12 100644
--- a/crates/iceberg/src/puffin/compression.rs
+++ b/crates/iceberg/src/puffin/compression.rs
@@ -15,10 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use serde::{Deserialize, Serialize};
+
 use crate::{Error, ErrorKind, Result};
 
-#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
 /// Data compression formats
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Default, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
 pub enum CompressionCodec {
     #[default]
     /// No compression
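Deriving `Serialize`/`Deserialize` with `rename_all = "lowercase"` makes the JSON form of `CompressionCodec` match the lowercase codec names Puffin metadata uses for its `compression-codec` field. A small self-contained sketch of that behavior follows; it uses a local copy of the enum (the real one is crate-private), and assumes the `serde` derive feature plus `serde_json` as dev dependencies:

```rust
use serde::{Deserialize, Serialize};

// Stand-in for crate::puffin::compression::CompressionCodec, declared
// with the same serde attributes as the patch above.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
enum CompressionCodec {
    None,
    Lz4,
    Zstd,
}

fn main() {
    // "lowercase" renaming maps the Rust variant names onto the spec's
    // codec strings: None -> "none", Lz4 -> "lz4", Zstd -> "zstd".
    assert_eq!(
        serde_json::to_string(&CompressionCodec::Zstd).unwrap(),
        "\"zstd\""
    );
    let codec: CompressionCodec = serde_json::from_str("\"lz4\"").unwrap();
    assert_eq!(codec, CompressionCodec::Lz4);
}
```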
diff --git a/crates/iceberg/src/puffin/metadata.rs b/crates/iceberg/src/puffin/metadata.rs
new file mode 100644
index 000000000..9d0003225
--- /dev/null
+++ b/crates/iceberg/src/puffin/metadata.rs
@@ -0,0 +1,777 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::{HashMap, HashSet};
+
+use bytes::Bytes;
+use serde::{Deserialize, Serialize};
+
+use crate::io::{FileRead, InputFile};
+use crate::puffin::compression::CompressionCodec;
+use crate::{Error, ErrorKind, Result};
+
+/// Human-readable identification of the application writing the file, along with its version.
+/// Example: "Trino version 381"
+pub(crate) const CREATED_BY_PROPERTY: &str = "created-by";
+
+/// Metadata about a blob.
+/// For more information, see: https://iceberg.apache.org/puffin-spec/#blobmetadata
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "kebab-case")]
+pub(crate) struct BlobMetadata {
+    /// See blob types: https://iceberg.apache.org/puffin-spec/#blob-types
+    pub(crate) r#type: String,
+    /// List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob.
+    pub(crate) fields: Vec<i32>,
+    /// ID of the Iceberg table's snapshot the blob was computed from
+    pub(crate) snapshot_id: i64,
+    /// Sequence number of the Iceberg table's snapshot the blob was computed from
+    pub(crate) sequence_number: i64,
+    /// The offset in the file where the blob contents start
+    pub(crate) offset: u64,
+    /// The length of the blob stored in the file (after compression, if compressed)
+    pub(crate) length: u64,
+    /// The compression codec used to compress the data
+    #[serde(skip_serializing_if = "CompressionCodec::is_none")]
+    #[serde(default)]
+    pub(crate) compression_codec: CompressionCodec,
+    /// Arbitrary meta-information about the blob
+    #[serde(skip_serializing_if = "HashMap::is_empty")]
+    #[serde(default)]
+    pub(crate) properties: HashMap<String, String>,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub(crate) enum Flag {
+    FooterPayloadCompressed = 0,
+}
+
+impl Flag {
+    pub(crate) fn byte_idx(self) -> u8 {
+        (self as u8) / 8
+    }
+
+    pub(crate) fn bit_idx(self) -> u8 {
+        (self as u8) % 8
+    }
+
+    fn matches(self, byte_idx: u8, bit_idx: u8) -> bool {
+        self.byte_idx() == byte_idx && self.bit_idx() == bit_idx
+    }
+
+    fn from(byte_idx: u8, bit_idx: u8) -> Result<Flag> {
+        if Flag::FooterPayloadCompressed.matches(byte_idx, bit_idx) {
+            Ok(Flag::FooterPayloadCompressed)
+        } else {
+            Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Unknown flag byte {} and bit {} combination",
+                    byte_idx, bit_idx
+                ),
+            ))
+        }
+    }
+}
+
+/// Metadata about a puffin file.
+/// For more information, see: https://iceberg.apache.org/puffin-spec/#filemetadata
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
+pub(crate) struct FileMetadata {
+    /// Metadata about blobs in file
+    pub(crate) blobs: Vec<BlobMetadata>,
+    /// Arbitrary meta-information, like writer identification/version.
+    #[serde(skip_serializing_if = "HashMap::is_empty")]
+    #[serde(default)]
+    pub(crate) properties: HashMap<String, String>,
+}
+
+impl FileMetadata {
+    pub(crate) const MAGIC_LENGTH: u8 = 4;
+    pub(crate) const MAGIC: [u8; FileMetadata::MAGIC_LENGTH as usize] = [0x50, 0x46, 0x41, 0x31];
+
+    // We use the term FOOTER_STRUCT to refer to the fixed-length portion of the Footer, as illustrated below.
+    //
+    //                       Footer
+    //                         |
+    //  -------------------------------------------------
+    //  |                                               |
+    // Magic FooterPayload FooterPayloadLength Flags Magic
+    //                     |                            |
+    //                     ------------------------------
+    //                                   |
+    //                            FOOTER_STRUCT
+
+    const FOOTER_STRUCT_PAYLOAD_LENGTH_OFFSET: u8 = 0;
+    const FOOTER_STRUCT_PAYLOAD_LENGTH_LENGTH: u8 = 4;
+    const FOOTER_STRUCT_FLAGS_OFFSET: u8 = FileMetadata::FOOTER_STRUCT_PAYLOAD_LENGTH_OFFSET
+        + FileMetadata::FOOTER_STRUCT_PAYLOAD_LENGTH_LENGTH;
+    pub(crate) const FOOTER_STRUCT_FLAGS_LENGTH: u8 = 4;
+    const FOOTER_STRUCT_MAGIC_OFFSET: u8 =
+        FileMetadata::FOOTER_STRUCT_FLAGS_OFFSET + FileMetadata::FOOTER_STRUCT_FLAGS_LENGTH;
+    pub(crate) const FOOTER_STRUCT_LENGTH: u8 =
+        FileMetadata::FOOTER_STRUCT_MAGIC_OFFSET + FileMetadata::MAGIC_LENGTH;
+
+    fn check_magic(bytes: &[u8]) -> Result<()> {
+        if bytes == FileMetadata::MAGIC {
+            Ok(())
+        } else {
+            Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Bad magic value: {:?} should be {:?}",
+                    bytes,
+                    FileMetadata::MAGIC
+                ),
+            ))
+        }
+    }
+
+    async fn read_footer_payload_length(
+        file_read: &dyn FileRead,
+        input_file_length: u64,
+    ) -> Result<u32> {
+        let start = input_file_length - FileMetadata::FOOTER_STRUCT_LENGTH as u64;
+        let end = start + FileMetadata::FOOTER_STRUCT_PAYLOAD_LENGTH_LENGTH as u64;
+        let footer_payload_length_bytes = file_read.read(start..end).await?;
+        let mut buf = [0; 4];
+        buf.copy_from_slice(&footer_payload_length_bytes);
+        let footer_payload_length = u32::from_le_bytes(buf);
+        Ok(footer_payload_length)
+    }
+
+    async fn read_footer_bytes(
+        file_read: &dyn FileRead,
+        input_file_length: u64,
+        footer_payload_length: u32,
+    ) -> Result<Bytes> {
+        let footer_length = footer_payload_length as u64
+            + FileMetadata::FOOTER_STRUCT_LENGTH as u64
+            + FileMetadata::MAGIC_LENGTH as u64;
+        let start = input_file_length - footer_length;
+        let end = input_file_length;
+        file_read.read(start..end).await
+    }
+
+    fn decode_flags(footer_bytes: &[u8]) -> Result<HashSet<Flag>> {
+        let mut flags = HashSet::new();
+
+        for byte_idx in 0..FileMetadata::FOOTER_STRUCT_FLAGS_LENGTH {
+            let byte_offset = footer_bytes.len()
+                - FileMetadata::MAGIC_LENGTH as usize
+                - FileMetadata::FOOTER_STRUCT_FLAGS_LENGTH as usize
+                + byte_idx as usize;
+
+            let flag_byte = *footer_bytes.get(byte_offset).ok_or_else(|| {
+                Error::new(ErrorKind::DataInvalid, "Index range is out of bounds.")
+            })?;
+
+            for bit_idx in 0..8 {
+                if ((flag_byte >> bit_idx) & 1) != 0 {
+                    let flag = Flag::from(byte_idx, bit_idx)?;
+                    flags.insert(flag);
+                }
+            }
+        }
+
+        Ok(flags)
+    }
+
+    fn extract_footer_payload_as_str(
+        footer_bytes: &[u8],
+        footer_payload_length: u32,
+    ) -> Result<String> {
+        let flags = FileMetadata::decode_flags(footer_bytes)?;
+        let footer_compression_codec = if flags.contains(&Flag::FooterPayloadCompressed) {
+            CompressionCodec::Lz4
+        } else {
+            CompressionCodec::None
+        };
+
+        let start_offset = FileMetadata::MAGIC_LENGTH as usize;
+        let end_offset =
+            FileMetadata::MAGIC_LENGTH as usize + usize::try_from(footer_payload_length)?;
+        let footer_payload_bytes = footer_bytes
+            .get(start_offset..end_offset)
+            .ok_or_else(|| Error::new(ErrorKind::DataInvalid, "Index range is out of bounds."))?;
+        let decompressed_footer_payload_bytes =
+            footer_compression_codec.decompress(footer_payload_bytes.into())?;
+
+        String::from_utf8(decompressed_footer_payload_bytes).map_err(|src| {
+            Error::new(ErrorKind::DataInvalid, "Footer is not a valid UTF-8 string")
+                .with_source(src)
+        })
+    }
+
+    fn from_json_str(string: &str) -> Result<FileMetadata> {
+        serde_json::from_str::<FileMetadata>(string).map_err(|src| {
+            Error::new(ErrorKind::DataInvalid, "Given string is not valid JSON").with_source(src)
+        })
+    }
+
+    /// Returns the file metadata about a Puffin file
+    pub(crate) async fn read(input_file: &InputFile) -> Result<FileMetadata> {
+        let file_read = input_file.reader().await?;
+
+        let first_four_bytes = file_read.read(0..FileMetadata::MAGIC_LENGTH.into()).await?;
+        FileMetadata::check_magic(&first_four_bytes)?;
+
+        let input_file_length = input_file.metadata().await?.size;
+        let footer_payload_length =
+            FileMetadata::read_footer_payload_length(&file_read, input_file_length).await?;
+        let footer_bytes =
+            FileMetadata::read_footer_bytes(&file_read, input_file_length, footer_payload_length)
+                .await?;
+
+        let magic_length = FileMetadata::MAGIC_LENGTH as usize;
+        // check first four bytes of footer
+        FileMetadata::check_magic(&footer_bytes[..magic_length])?;
+        // check last four bytes of footer
+        FileMetadata::check_magic(&footer_bytes[footer_bytes.len() - magic_length..])?;
+
+        let footer_payload_str =
+            FileMetadata::extract_footer_payload_as_str(&footer_bytes, footer_payload_length)?;
+        FileMetadata::from_json_str(&footer_payload_str)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    use crate::io::{FileIOBuilder, InputFile};
+    use crate::puffin::metadata::{BlobMetadata, CompressionCodec, FileMetadata};
+    use crate::puffin::test_utils::{
+        empty_footer_payload, empty_footer_payload_bytes, empty_footer_payload_bytes_length_bytes,
+        java_empty_uncompressed_input_file, java_uncompressed_metric_input_file,
+        java_zstd_compressed_metric_input_file, uncompressed_metric_file_metadata,
+        zstd_compressed_metric_file_metadata,
+    };
+
+    const INVALID_MAGIC_VALUE: [u8; 4] = [80, 70, 65, 0];
+
+    async fn input_file_with_bytes(temp_dir: &TempDir, slice: &[u8]) -> InputFile {
+        let file_io = FileIOBuilder::new_fs_io().build().unwrap();
+
+        let path_buf = temp_dir.path().join("abc.puffin");
+        let temp_path = path_buf.to_str().unwrap();
+        let output_file = file_io.new_output(temp_path).unwrap();
+
+        output_file
+            .write(Bytes::copy_from_slice(slice))
+            .await
+            .unwrap();
+
+        output_file.to_input_file()
+    }
+
+    async fn input_file_with_payload(temp_dir: &TempDir, payload_str: &str) -> InputFile {
+        let payload_bytes = payload_str.as_bytes();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(payload_bytes);
+        bytes.extend(u32::to_le_bytes(payload_bytes.len() as u32));
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC);
+
+        input_file_with_bytes(temp_dir, &bytes).await
+    }
+
+    #[tokio::test]
+    async fn test_file_starting_with_invalid_magic_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(INVALID_MAGIC_VALUE.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC);
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Bad magic value: [80, 70, 65, 0] should be [80, 70, 65, 49]",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_file_with_invalid_magic_at_start_of_footer_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(INVALID_MAGIC_VALUE.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC);
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Bad magic value: [80, 70, 65, 0] should be [80, 70, 65, 49]",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_file_ending_with_invalid_magic_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(INVALID_MAGIC_VALUE);
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Bad magic value: [80, 70, 65, 0] should be [80, 70, 65, 49]",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_encoded_payload_length_larger_than_actual_payload_length_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(u32::to_le_bytes(
+            empty_footer_payload_bytes().len() as u32 + 1,
+        ));
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Bad magic value: [49, 80, 70, 65] should be [80, 70, 65, 49]",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_encoded_payload_length_smaller_than_actual_payload_length_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(u32::to_le_bytes(
+            empty_footer_payload_bytes().len() as u32 - 1,
+        ));
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Bad magic value: [70, 65, 49, 123] should be [80, 70, 65, 49]",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_lz4_compressed_footer_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0b00000001, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "FeatureUnsupported => LZ4 decompression is not supported currently",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_unknown_byte_bit_combination_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0b00000010, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file)
+                .await
+                .unwrap_err()
+                .to_string(),
+            "DataInvalid => Unknown flag byte 0 and bit 1 combination",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_non_utf8_string_payload_returns_error() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let payload_bytes: [u8; 4] = [0, 159, 146, 150];
+        let payload_bytes_length_bytes: [u8; 4] = u32::to_le_bytes(payload_bytes.len() as u32);
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(payload_bytes);
+        bytes.extend(payload_bytes_length_bytes);
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap_err().to_string(),
+            "DataInvalid => Footer is not a valid UTF-8 string, source: invalid utf-8 sequence of 1 bytes from index 1",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_minimal_valid_file_returns_file_metadata() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let mut bytes = vec![];
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(FileMetadata::MAGIC.to_vec());
+        bytes.extend(empty_footer_payload_bytes());
+        bytes.extend(empty_footer_payload_bytes_length_bytes());
+        bytes.extend(vec![0, 0, 0, 0]);
+        bytes.extend(FileMetadata::MAGIC);
+
+        let input_file = input_file_with_bytes(&temp_dir, &bytes).await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap(),
+            FileMetadata {
+                blobs: vec![],
+                properties: HashMap::new(),
+            }
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_file_metadata_property() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "blobs" : [ ],
+                "properties" : {
+                    "a property" : "a property value"
+                }
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap(),
+            FileMetadata {
+                blobs: vec![],
+                properties: {
+                    let mut map = HashMap::new();
+                    map.insert("a property".to_string(), "a property value".to_string());
+                    map
+                },
+            }
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_file_metadata_properties() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "blobs" : [ ],
+                "properties" : {
+                    "a property" : "a property value",
+                    "another one": "also with value"
+                }
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap(),
+            FileMetadata {
+                blobs: vec![],
+                properties: {
+                    let mut map = HashMap::new();
+                    map.insert("a property".to_string(), "a property value".to_string());
+                    map.insert("another one".to_string(), "also with value".to_string());
+                    map
+                },
+            }
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_error_if_blobs_field_is_missing() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "properties" : {}
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap_err().to_string(),
+            format!(
+                "DataInvalid => Given string is not valid JSON, source: missing field `blobs` at line 3 column 13"
+            ),
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_error_if_blobs_field_is_bad() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "blobs" : {}
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap_err().to_string(),
+            format!("DataInvalid => Given string is not valid JSON, source: invalid type: map, expected a sequence at line 2 column 26"),
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_blobs_metadatas() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "blobs" : [
+                    {
+                        "type" : "type-a",
+                        "fields" : [ 1 ],
+                        "snapshot-id" : 14,
+                        "sequence-number" : 3,
+                        "offset" : 4,
+                        "length" : 16
+                    },
+                    {
+                        "type" : "type-bbb",
+                        "fields" : [ 2, 3, 4 ],
+                        "snapshot-id" : 77,
+                        "sequence-number" : 4,
+                        "offset" : 21474836470000,
+                        "length" : 79834
+                    }
+                ]
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap(),
+            FileMetadata {
+                blobs: vec![
+                    BlobMetadata {
+                        r#type: "type-a".to_string(),
+                        fields: vec![1],
+                        snapshot_id: 14,
+                        sequence_number: 3,
+                        offset: 4,
+                        length: 16,
+                        compression_codec: CompressionCodec::None,
+                        properties: HashMap::new(),
+                    },
+                    BlobMetadata {
+                        r#type: "type-bbb".to_string(),
+                        fields: vec![2, 3, 4],
+                        snapshot_id: 77,
+                        sequence_number: 4,
+                        offset: 21474836470000,
+                        length: 79834,
+                        compression_codec: CompressionCodec::None,
+                        properties: HashMap::new(),
+                    },
+                ],
+                properties: HashMap::new(),
+            }
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_properties_in_blob_metadata() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            r#"{
+                "blobs" : [
+                    {
+                        "type" : "type-a",
+                        "fields" : [ 1 ],
+                        "snapshot-id" : 14,
+                        "sequence-number" : 3,
+                        "offset" : 4,
+                        "length" : 16,
+                        "properties" : {
+                            "some key" : "some value"
+                        }
+                    }
+                ]
+            }"#,
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap(),
+            FileMetadata {
+                blobs: vec![BlobMetadata {
+                    r#type: "type-a".to_string(),
+                    fields: vec![1],
+                    snapshot_id: 14,
+                    sequence_number: 3,
+                    offset: 4,
+                    length: 16,
+                    compression_codec: CompressionCodec::None,
+                    properties: {
+                        let mut map = HashMap::new();
+                        map.insert("some key".to_string(), "some value".to_string());
+                        map
+                    },
+                }],
+                properties: HashMap::new(),
+            }
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_error_if_blobs_fields_value_is_outside_i32_range() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let out_of_i32_range_number: i64 = i32::MAX as i64 + 1;
+
+        let input_file = input_file_with_payload(
+            &temp_dir,
+            &format!(
+                r#"{{
+                    "blobs" : [
+                        {{
+                            "type" : "type-a",
+                            "fields" : [ {} ],
+                            "snapshot-id" : 14,
+                            "sequence-number" : 3,
+                            "offset" : 4,
+                            "length" : 16
+                        }}
+                    ]
+                }}"#,
+                out_of_i32_range_number
+            ),
+        )
+        .await;
+
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap_err().to_string(),
+            format!(
+                "DataInvalid => Given string is not valid JSON, source: invalid value: integer `{}`, expected i32 at line 5 column 51",
+                out_of_i32_range_number
+            ),
+        )
+    }
+
+    #[tokio::test]
+    async fn test_returns_errors_if_footer_payload_is_not_encoded_in_json_format() {
+        let temp_dir = TempDir::new().unwrap();
+
+        let input_file = input_file_with_payload(&temp_dir, r#""blobs" = []"#).await;
+        assert_eq!(
+            FileMetadata::read(&input_file).await.unwrap_err().to_string(),
+            "DataInvalid => Given string is not valid JSON, source: invalid type: string \"blobs\", expected struct FileMetadata at line 1 column 7",
+        )
+    }
+
+    #[tokio::test]
+    async fn test_read_file_metadata_of_uncompressed_empty_file() {
+        let input_file = java_empty_uncompressed_input_file();
+        let file_metadata = FileMetadata::read(&input_file).await.unwrap();
+        assert_eq!(file_metadata, empty_footer_payload())
+    }
+
+    #[tokio::test]
+    async fn test_read_file_metadata_of_uncompressed_metric_data() {
+        let input_file = java_uncompressed_metric_input_file();
+        let file_metadata = FileMetadata::read(&input_file).await.unwrap();
+        assert_eq!(file_metadata, uncompressed_metric_file_metadata())
+    }
+
+    #[tokio::test]
+    async fn test_read_file_metadata_of_zstd_compressed_metric_data() {
+        let input_file = java_zstd_compressed_metric_input_file();
+        let file_metadata = FileMetadata::read(&input_file).await.unwrap();
+        assert_eq!(file_metadata, zstd_compressed_metric_file_metadata())
+    }
+}
diff --git a/crates/iceberg/src/puffin/mod.rs b/crates/iceberg/src/puffin/mod.rs
index c13ebe420..91bdf125f 100644
--- a/crates/iceberg/src/puffin/mod.rs
+++ b/crates/iceberg/src/puffin/mod.rs
@@ -22,3 +22,7 @@
 #![allow(dead_code)]
 
 mod compression;
+mod metadata;
+
+#[cfg(test)]
+mod test_utils;
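To make the footer arithmetic in `FileMetadata` above concrete: the fixed-length FOOTER_STRUCT is 12 bytes (payload length 4 + flags 4 + trailing magic 4), and `read_footer_bytes` fetches that plus the payload plus the leading footer magic from the end of the file. The following is a self-contained sketch with the constants copied from the code above; `footer_range` and `footer_payload_compressed` are illustrative helpers, not the crate's API:

```rust
use std::ops::Range;

// Footer layout, front to back:
//   Magic (4) | FooterPayload (N) | PayloadLength (4, LE) | Flags (4) | Magic (4)
const MAGIC_LENGTH: u64 = 4;
const FOOTER_STRUCT_LENGTH: u64 = 4 + 4 + MAGIC_LENGTH; // payload length + flags + trailing magic

// Byte range covering the whole footer (both magics included), mirroring
// the arithmetic in FileMetadata::read_footer_bytes.
fn footer_range(file_len: u64, payload_len: u32) -> Range<u64> {
    let footer_len = payload_len as u64 + FOOTER_STRUCT_LENGTH + MAGIC_LENGTH;
    (file_len - footer_len)..file_len
}

// FooterPayloadCompressed is flag 0, i.e. byte 0, bit 0 of the flags,
// which is why the tests above use [0b00000001, 0, 0, 0] to trigger it.
fn footer_payload_compressed(flags: [u8; 4]) -> bool {
    flags[0] & 1 != 0
}

fn main() {
    // A 100-byte file with a 30-byte footer payload: the footer spans
    // 30 + 12 + 4 = 46 trailing bytes, i.e. bytes 54..100.
    assert_eq!(footer_range(100, 30), 54..100);
    assert!(footer_payload_compressed([0b0000_0001, 0, 0, 0]));
    assert!(!footer_payload_compressed([0, 0, 0, 0]));
}
```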
diff --git a/crates/iceberg/src/puffin/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs
new file mode 100644
index 000000000..e49e51d50
--- /dev/null
+++ b/crates/iceberg/src/puffin/test_utils.rs
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+
+use crate::io::{FileIOBuilder, InputFile};
+use crate::puffin::compression::CompressionCodec;
+use crate::puffin::metadata::{BlobMetadata, FileMetadata, CREATED_BY_PROPERTY};
+
+const JAVA_TESTDATA: &str = "testdata/puffin/java-generated";
+const EMPTY_UNCOMPRESSED: &str = "empty-puffin-uncompressed.bin";
+const METRIC_UNCOMPRESSED: &str = "sample-metric-data-uncompressed.bin";
+const METRIC_ZSTD_COMPRESSED: &str = "sample-metric-data-compressed-zstd.bin";
+
+fn input_file_for_test_data(path: &str) -> InputFile {
+    FileIOBuilder::new_fs_io()
+        .build()
+        .unwrap()
+        .new_input(env!("CARGO_MANIFEST_DIR").to_owned() + "/" + path)
+        .unwrap()
+}
+
+pub(crate) fn java_empty_uncompressed_input_file() -> InputFile {
+    input_file_for_test_data(&[JAVA_TESTDATA, EMPTY_UNCOMPRESSED].join("/"))
+}
+
+pub(crate) fn java_uncompressed_metric_input_file() -> InputFile {
+    input_file_for_test_data(&[JAVA_TESTDATA, METRIC_UNCOMPRESSED].join("/"))
+}
+
+pub(crate) fn java_zstd_compressed_metric_input_file() -> InputFile {
+    input_file_for_test_data(&[JAVA_TESTDATA, METRIC_ZSTD_COMPRESSED].join("/"))
+}
+
+pub(crate) fn empty_footer_payload() -> FileMetadata {
+    FileMetadata {
+        blobs: Vec::new(),
+        properties: HashMap::new(),
+    }
+}
+
+pub(crate) fn empty_footer_payload_bytes() -> Vec<u8> {
+    return serde_json::to_string::<FileMetadata>(&empty_footer_payload())
+        .unwrap()
+        .as_bytes()
+        .to_vec();
+}
+
+pub(crate) fn empty_footer_payload_bytes_length_bytes() -> [u8; 4] {
+    u32::to_le_bytes(empty_footer_payload_bytes().len() as u32)
+}
+
+pub(crate) const METRIC_BLOB_0_TYPE: &str = "some-blob";
+pub(crate) const METRIC_BLOB_0_INPUT_FIELDS: [i32; 1] = [1];
+pub(crate) const METRIC_BLOB_0_SNAPSHOT_ID: i64 = 2;
+pub(crate) const METRIC_BLOB_0_SEQUENCE_NUMBER: i64 = 1;
+
+pub(crate) fn zstd_compressed_metric_blob_0_metadata() -> BlobMetadata {
+    BlobMetadata {
+        r#type: METRIC_BLOB_0_TYPE.to_string(),
+        fields: METRIC_BLOB_0_INPUT_FIELDS.to_vec(),
+        snapshot_id: METRIC_BLOB_0_SNAPSHOT_ID,
+        sequence_number: METRIC_BLOB_0_SEQUENCE_NUMBER,
+        offset: 4,
+        length: 22,
+        compression_codec: CompressionCodec::Zstd,
+        properties: HashMap::new(),
+    }
+}
+
+pub(crate) fn uncompressed_metric_blob_0_metadata() -> BlobMetadata {
+    BlobMetadata {
+        r#type: METRIC_BLOB_0_TYPE.to_string(),
+        fields: METRIC_BLOB_0_INPUT_FIELDS.to_vec(),
+        snapshot_id: METRIC_BLOB_0_SNAPSHOT_ID,
+        sequence_number: METRIC_BLOB_0_SEQUENCE_NUMBER,
+        offset: 4,
+        length: 9,
+        compression_codec: CompressionCodec::None,
+        properties: HashMap::new(),
+    }
+}
+
+pub(crate) const METRIC_BLOB_1_TYPE: &str = "some-other-blob";
+pub(crate) const METRIC_BLOB_1_INPUT_FIELDS: [i32; 1] = [2];
+pub(crate) const METRIC_BLOB_1_SNAPSHOT_ID: i64 = 2;
+pub(crate) const METRIC_BLOB_1_SEQUENCE_NUMBER: i64 = 1;
+
+pub(crate) fn uncompressed_metric_blob_1_metadata() -> BlobMetadata {
+    BlobMetadata {
+        r#type: METRIC_BLOB_1_TYPE.to_string(),
+        fields: METRIC_BLOB_1_INPUT_FIELDS.to_vec(),
+        snapshot_id: METRIC_BLOB_1_SNAPSHOT_ID,
+        sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER,
+        offset: 13,
+        length: 83,
+        compression_codec: CompressionCodec::None,
+        properties: HashMap::new(),
+    }
+}
+
+pub(crate) fn zstd_compressed_metric_blob_1_metadata() -> BlobMetadata {
+    BlobMetadata {
+        r#type: METRIC_BLOB_1_TYPE.to_string(),
+        fields: METRIC_BLOB_1_INPUT_FIELDS.to_vec(),
+        snapshot_id: METRIC_BLOB_1_SNAPSHOT_ID,
+        sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER,
+        offset: 26,
+        length: 77,
+        compression_codec: CompressionCodec::Zstd,
+ +pub(crate) fn zstd_compressed_metric_blob_1_metadata() -> BlobMetadata { + BlobMetadata { + r#type: METRIC_BLOB_1_TYPE.to_string(), + fields: METRIC_BLOB_1_INPUT_FIELDS.to_vec(), + snapshot_id: METRIC_BLOB_1_SNAPSHOT_ID, + sequence_number: METRIC_BLOB_1_SEQUENCE_NUMBER, + offset: 26, + length: 77, + compression_codec: CompressionCodec::Zstd, + properties: HashMap::new(), + } +} + +pub(crate) const CREATED_BY_PROPERTY_VALUE: &str = "Test 1234"; + +pub(crate) fn file_properties() -> HashMap<String, String> { + let mut properties = HashMap::new(); + properties.insert( + CREATED_BY_PROPERTY.to_string(), + CREATED_BY_PROPERTY_VALUE.to_string(), + ); + properties +} + +pub(crate) fn uncompressed_metric_file_metadata() -> FileMetadata { + FileMetadata { + blobs: vec![ + uncompressed_metric_blob_0_metadata(), + uncompressed_metric_blob_1_metadata(), + ], + properties: file_properties(), + } +} + +pub(crate) fn zstd_compressed_metric_file_metadata() -> FileMetadata { + FileMetadata { + blobs: vec![ + zstd_compressed_metric_blob_0_metadata(), + zstd_compressed_metric_blob_1_metadata(), + ], + properties: file_properties(), + } +} diff --git a/crates/iceberg/src/scan.rs b/crates/iceberg/src/scan.rs index 5a97e74e7..7e05da59a 100644 --- a/crates/iceberg/src/scan.rs +++ b/crates/iceberg/src/scan.rs @@ -248,7 +248,7 @@ impl<'a> TableScanBuilder<'a> { ) })?; - let field = schema + schema .as_struct() .field_by_id(field_id) .ok_or_else(|| { @@ -261,16 +261,6 @@ impl<'a> TableScanBuilder<'a> { ) })?; - if !field.field_type.is_primitive() { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - format!( - "Column {} is not a primitive type. Schema: {}", - column_name, schema - ), - )); - } - field_ids.push(field_id); } @@ -983,9 +973,9 @@ pub mod tests { use crate::io::{FileIO, OutputFile}; use crate::scan::FileScanTask; use crate::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, Datum, FormatVersion, Literal, Manifest, - ManifestContentType, ManifestEntry, ManifestListWriter, ManifestMetadata, ManifestStatus, - ManifestWriter, NestedField, PrimitiveType, Schema, Struct, TableMetadata, Type, + DataContentType, DataFileBuilder, DataFileFormat, Datum, Literal, ManifestEntry, + ManifestListWriter, ManifestStatus, ManifestWriterBuilder, NestedField, PrimitiveType, + Schema, Struct, TableMetadata, Type, }; use crate::table::Table; use crate::TableIdent; @@ -1059,20 +1049,16 @@ pub mod tests { let current_partition_spec = self.table.metadata().default_partition_spec(); // Write data files - let data_file_manifest = ManifestWriter::new( + let mut writer = ManifestWriterBuilder::new( self.next_manifest_file(), - current_snapshot.snapshot_id(), + Some(current_snapshot.snapshot_id()), vec![], + current_schema.clone(), + current_partition_spec.as_ref().clone(), ) - .write(Manifest::new( - ManifestMetadata::builder() - .schema(current_schema.clone()) - .content(ManifestContentType::Data) - .format_version(FormatVersion::V2) - .partition_spec((**current_partition_spec).clone()) - .schema_id(current_schema.schema_id()) - .build(), - vec![ + .build_v2_data(); + writer + .add_entry( ManifestEntry::builder() .status(ManifestStatus::Added) .data_file( @@ -1088,6 +1074,10 @@ pub mod tests { .unwrap(), ) .build(), + ) + .unwrap(); + writer + .add_delete_entry( ManifestEntry::builder() .status(ManifestStatus::Deleted) .snapshot_id(parent_snapshot.snapshot_id()) @@ -1105,6 +1095,10 @@ pub mod tests { .unwrap(), ) .build(), + ) + .unwrap(); + writer + .add_existing_entry( ManifestEntry::builder() .status(ManifestStatus::Existing) .snapshot_id(parent_snapshot.snapshot_id()) @@ -1122,10 +1116,9 @@ pub mod tests { .unwrap(), ) .build(), - ], - )) - .await - .unwrap(); + ) + .unwrap(); + let data_file_manifest = writer.write_manifest_file().await.unwrap(); // Write to manifest list let mut manifest_list_write = ManifestListWriter::v2( diff --git a/crates/iceberg/src/spec/datatypes.rs
b/crates/iceberg/src/spec/datatypes.rs index bce10ad5f..c806d16ea 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -36,9 +36,11 @@ use crate::spec::datatypes::_decimal::{MAX_PRECISION, REQUIRED_LENGTH}; use crate::spec::PrimitiveLiteral; /// Field name for list type. -pub(crate) const LIST_FILED_NAME: &str = "element"; -pub(crate) const MAP_KEY_FIELD_NAME: &str = "key"; -pub(crate) const MAP_VALUE_FIELD_NAME: &str = "value"; +pub const LIST_FIELD_NAME: &str = "element"; +/// Field name for map type's key. +pub const MAP_KEY_FIELD_NAME: &str = "key"; +/// Field name for map type's value. +pub const MAP_VALUE_FIELD_NAME: &str = "value"; pub(crate) const MAX_DECIMAL_BYTES: u32 = 24; pub(crate) const MAX_DECIMAL_PRECISION: u32 = 38; @@ -226,8 +228,10 @@ pub enum PrimitiveType { /// Timestamp in microsecond precision, with timezone Timestamptz, /// Timestamp in nanosecond precision, without timezone + #[serde(rename = "timestamp_ns")] TimestampNs, /// Timestamp in nanosecond precision with timezone + #[serde(rename = "timestamptz_ns")] TimestamptzNs, /// Arbitrary-length character sequences encoded in utf-8 String, @@ -631,9 +635,9 @@ impl NestedField { /// Construct list type's element field. pub fn list_element(id: i32, field_type: Type, required: bool) -> Self { if required { - Self::required(id, LIST_FILED_NAME, field_type) + Self::required(id, LIST_FIELD_NAME, field_type) } else { - Self::optional(id, LIST_FILED_NAME, field_type) + Self::optional(id, LIST_FIELD_NAME, field_type) } } @@ -855,64 +859,107 @@ mod tests { } #[test] - fn decimal() { + fn primitive_type_serde() { let record = r#" - { - "type": "struct", - "fields": [ - { - "id": 1, - "name": "id", - "required": true, - "type": "decimal(9,2)" - } - ] - } - "#; - - check_type_serde( - record, - Type::Struct(StructType { - fields: vec![NestedField::required( - 1, - "id", - Type::Primitive(PrimitiveType::Decimal { - precision: 9, - scale: 2, - }), - ) - .into()], - id_lookup: OnceLock::default(), - name_lookup: OnceLock::default(), - }), - ) + { + "type": "struct", + "fields": [ + {"id": 1, "name": "bool_field", "required": true, "type": "boolean"}, + {"id": 2, "name": "int_field", "required": true, "type": "int"}, + {"id": 3, "name": "long_field", "required": true, "type": "long"}, + {"id": 4, "name": "float_field", "required": true, "type": "float"}, + {"id": 5, "name": "double_field", "required": true, "type": "double"}, + {"id": 6, "name": "decimal_field", "required": true, "type": "decimal(9,2)"}, + {"id": 7, "name": "date_field", "required": true, "type": "date"}, + {"id": 8, "name": "time_field", "required": true, "type": "time"}, + {"id": 9, "name": "timestamp_field", "required": true, "type": "timestamp"}, + {"id": 10, "name": "timestamptz_field", "required": true, "type": "timestamptz"}, + {"id": 11, "name": "timestamp_ns_field", "required": true, "type": "timestamp_ns"}, + {"id": 12, "name": "timestamptz_ns_field", "required": true, "type": "timestamptz_ns"}, + {"id": 13, "name": "uuid_field", "required": true, "type": "uuid"}, + {"id": 14, "name": "fixed_field", "required": true, "type": "fixed[10]"}, + {"id": 15, "name": "binary_field", "required": true, "type": "binary"}, + {"id": 16, "name": "string_field", "required": true, "type": "string"} + ] } - - #[test] - fn fixed() { - let record = r#" - { - "type": "struct", - "fields": [ - { - "id": 1, - "name": "id", - "required": true, - "type": "fixed[8]" - } - ] - } - "#; + "#; check_type_serde( record, 
Type::Struct(StructType { - fields: vec![NestedField::required( - 1, - "id", - Type::Primitive(PrimitiveType::Fixed(8)), - ) - .into()], + fields: vec![ + NestedField::required(1, "bool_field", Type::Primitive(PrimitiveType::Boolean)) + .into(), + NestedField::required(2, "int_field", Type::Primitive(PrimitiveType::Int)) + .into(), + NestedField::required(3, "long_field", Type::Primitive(PrimitiveType::Long)) + .into(), + NestedField::required(4, "float_field", Type::Primitive(PrimitiveType::Float)) + .into(), + NestedField::required( + 5, + "double_field", + Type::Primitive(PrimitiveType::Double), + ) + .into(), + NestedField::required( + 6, + "decimal_field", + Type::Primitive(PrimitiveType::Decimal { + precision: 9, + scale: 2, + }), + ) + .into(), + NestedField::required(7, "date_field", Type::Primitive(PrimitiveType::Date)) + .into(), + NestedField::required(8, "time_field", Type::Primitive(PrimitiveType::Time)) + .into(), + NestedField::required( + 9, + "timestamp_field", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + NestedField::required( + 10, + "timestamptz_field", + Type::Primitive(PrimitiveType::Timestamptz), + ) + .into(), + NestedField::required( + 11, + "timestamp_ns_field", + Type::Primitive(PrimitiveType::TimestampNs), + ) + .into(), + NestedField::required( + 12, + "timestamptz_ns_field", + Type::Primitive(PrimitiveType::TimestamptzNs), + ) + .into(), + NestedField::required(13, "uuid_field", Type::Primitive(PrimitiveType::Uuid)) + .into(), + NestedField::required( + 14, + "fixed_field", + Type::Primitive(PrimitiveType::Fixed(10)), + ) + .into(), + NestedField::required( + 15, + "binary_field", + Type::Primitive(PrimitiveType::Binary), + ) + .into(), + NestedField::required( + 16, + "string_field", + Type::Primitive(PrimitiveType::String), + ) + .into(), + ], id_lookup: OnceLock::default(), name_lookup: OnceLock::default(), }), diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index f517b8e0d..856339ab1 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -34,7 +34,7 @@ use self::_const_schema::{manifest_schema_v1, manifest_schema_v2}; use super::{ Datum, FieldSummary, FormatVersion, ManifestContentType, ManifestFile, PartitionSpec, PrimitiveLiteral, PrimitiveType, Schema, SchemaId, SchemaRef, Struct, StructType, - INITIAL_SEQUENCE_NUMBER, UNASSIGNED_SEQUENCE_NUMBER, + INITIAL_SEQUENCE_NUMBER, UNASSIGNED_SEQUENCE_NUMBER, UNASSIGNED_SNAPSHOT_ID, }; use crate::error::Result; use crate::io::OutputFile; @@ -114,11 +114,75 @@ impl Manifest { } } +/// The builder used to create a [`ManifestWriter`]. +pub struct ManifestWriterBuilder { + output: OutputFile, + snapshot_id: Option<i64>, + key_metadata: Vec<u8>, + schema: SchemaRef, + partition_spec: PartitionSpec, +} + +impl ManifestWriterBuilder { + /// Create a new builder. + pub fn new( + output: OutputFile, + snapshot_id: Option<i64>, + key_metadata: Vec<u8>, + schema: SchemaRef, + partition_spec: PartitionSpec, + ) -> Self { + Self { + output, + snapshot_id, + key_metadata, + schema, + partition_spec, + } + } + + /// Build a [`ManifestWriter`] for format version 1.
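+ /// + /// A minimal usage sketch (hypothetical `output_file`, `schema`, and + /// `partition_spec`; error handling elided): + /// + /// ```ignore + /// let mut writer = ManifestWriterBuilder::new( + /// output_file, + /// Some(snapshot_id), + /// vec![], // key_metadata + /// schema, + /// partition_spec, + /// ) + /// .build_v1(); + /// writer.add_file(data_file, sequence_number)?; + /// let manifest_file = writer.write_manifest_file().await?; + /// ```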
+ pub fn build_v1(self) -> ManifestWriter { + let metadata = ManifestMetadata::builder() + .schema_id(self.schema.schema_id()) + .schema(self.schema) + .partition_spec(self.partition_spec) + .format_version(FormatVersion::V1) + .content(ManifestContentType::Data) + .build(); + ManifestWriter::new(self.output, self.snapshot_id, self.key_metadata, metadata) + } + + /// Build a [`ManifestWriter`] for format version 2, data content. + pub fn build_v2_data(self) -> ManifestWriter { + let metadata = ManifestMetadata::builder() + .schema_id(self.schema.schema_id()) + .schema(self.schema) + .partition_spec(self.partition_spec) + .format_version(FormatVersion::V2) + .content(ManifestContentType::Data) + .build(); + ManifestWriter::new(self.output, self.snapshot_id, self.key_metadata, metadata) + } + + /// Build a [`ManifestWriter`] for format version 2, deletes content. + pub fn build_v2_deletes(self) -> ManifestWriter { + let metadata = ManifestMetadata::builder() + .schema_id(self.schema.schema_id()) + .schema(self.schema) + .partition_spec(self.partition_spec) + .format_version(FormatVersion::V2) + .content(ManifestContentType::Deletes) + .build(); + ManifestWriter::new(self.output, self.snapshot_id, self.key_metadata, metadata) + } +} + /// A manifest writer. pub struct ManifestWriter { output: OutputFile, - snapshot_id: i64, + snapshot_id: Option<i64>, added_files: u32, added_rows: u64, @@ -131,7 +195,9 @@ pub struct ManifestWriter { key_metadata: Vec<u8>, - partitions: Vec<Struct>, + manifest_entries: Vec<ManifestEntry>, + + metadata: ManifestMetadata, } struct PartitionFieldStats { @@ -198,7 +264,12 @@ impl PartitionFieldStats { impl ManifestWriter { /// Create a new manifest writer. - pub fn new(output: OutputFile, snapshot_id: i64, key_metadata: Vec<u8>) -> Self { + pub(crate) fn new( + output: OutputFile, + snapshot_id: Option<i64>, + key_metadata: Vec<u8>, + metadata: ManifestMetadata, + ) -> Self { Self { output, snapshot_id, @@ -210,7 +281,8 @@ impl ManifestWriter { deleted_rows: 0, min_seq_num: None, key_metadata, - partitions: vec![], + manifest_entries: Vec::new(), + metadata, } } @@ -218,14 +290,13 @@ impl ManifestWriter { &mut self, partition_type: &StructType, ) -> Result<Vec<FieldSummary>> { - let partitions = std::mem::take(&mut self.partitions); let mut field_stats: Vec<_> = partition_type .fields() .iter() .map(|f| PartitionFieldStats::new(f.field_type.as_primitive_type().unwrap().clone())) .collect(); - for partition in partitions { - for (literal, stat) in partition.into_iter().zip_eq(field_stats.iter_mut()) { + for partition in self.manifest_entries.iter().map(|e| &e.data_file.partition) { + for (literal, stat) in partition.iter().zip_eq(field_stats.iter_mut()) { let primitive_literal = literal.map(|v| v.as_primitive_literal().unwrap()); stat.update(primitive_literal)?; } @@ -233,15 +304,184 @@ impl ManifestWriter { Ok(field_stats.into_iter().map(|stat| stat.finish()).collect()) } - /// Write a manifest.
- pub async fn write(mut self, manifest: Manifest) -> Result<ManifestFile> { + fn check_data_file(&self, data_file: &DataFile) -> Result<()> { + match self.metadata.content { + ManifestContentType::Data => { + if data_file.content != DataContentType::Data { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Content type of entry {:?} should be DataContentType::Data", + data_file.content + ), + )); + } + } + ManifestContentType::Deletes => { + if data_file.content != DataContentType::EqualityDeletes + && data_file.content != DataContentType::PositionDeletes + { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Content type of entry {:?} should be DataContentType::EqualityDeletes or DataContentType::PositionDeletes", data_file.content), + )); + } + } + } + Ok(()) + } + + /// Add a new manifest entry. This method will update the following fields of the entry: + /// - Update the entry status to `Added` + /// - Set the snapshot id to the current snapshot id + /// - Set the sequence number to `None` if it is invalid (smaller than 0) + /// - Set the file sequence number to `None` + pub(crate) fn add_entry(&mut self, mut entry: ManifestEntry) -> Result<()> { + self.check_data_file(&entry.data_file)?; + if entry.sequence_number().is_some_and(|n| n >= 0) { + entry.status = ManifestStatus::Added; + entry.snapshot_id = self.snapshot_id; + entry.file_sequence_number = None; + } else { + entry.status = ManifestStatus::Added; + entry.snapshot_id = self.snapshot_id; + entry.sequence_number = None; + entry.file_sequence_number = None; + }; + self.add_entry_inner(entry)?; + Ok(()) + } + + /// Add a file as an added entry with a specific sequence number. The entry's snapshot ID will be this manifest's snapshot ID. The entry's data sequence + /// number will be the provided data sequence number. The entry's file sequence number will be + /// assigned at commit. + pub fn add_file(&mut self, data_file: DataFile, sequence_number: i64) -> Result<()> { + self.check_data_file(&data_file)?; + let entry = ManifestEntry { + status: ManifestStatus::Added, + snapshot_id: self.snapshot_id, + sequence_number: (sequence_number >= 0).then_some(sequence_number), + file_sequence_number: None, + data_file, + }; + self.add_entry_inner(entry)?; + Ok(()) + } + + /// Add a delete manifest entry. This method will update the following fields of the entry: + /// - Update the entry status to `Deleted` + /// - Set the snapshot id to the current snapshot id + /// + /// # TODO + /// Remove this allow later + #[allow(dead_code)] + pub(crate) fn add_delete_entry(&mut self, mut entry: ManifestEntry) -> Result<()> { + self.check_data_file(&entry.data_file)?; + entry.status = ManifestStatus::Deleted; + entry.snapshot_id = self.snapshot_id; + self.add_entry_inner(entry)?; + Ok(()) + } + + /// Add a file as a delete manifest entry. The entry's snapshot ID will be this manifest's snapshot ID. + /// However, the original data and file sequence numbers of the file must be preserved when + /// the file is marked as deleted. + pub fn add_delete_file( + &mut self, + data_file: DataFile, + sequence_number: i64, + file_sequence_number: Option<i64>, + ) -> Result<()> { + self.check_data_file(&data_file)?; + let entry = ManifestEntry { + status: ManifestStatus::Deleted, + snapshot_id: self.snapshot_id, + sequence_number: Some(sequence_number), + file_sequence_number, + data_file, + }; + self.add_entry_inner(entry)?; + Ok(()) + }
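+ + // An illustrative sketch contrasting the two paths above (hypothetical `new_file` + // and `old_file`): an added file receives this writer's snapshot id and the given + // data sequence number, while a deleted file keeps the sequence numbers it was + // originally committed with: + // + // writer.add_file(new_file, 2)?; + // writer.add_delete_file(old_file, 1, Some(1))?;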
+ + /// Add an existing manifest entry. This method will update the following fields of the entry: + /// - Update the entry status to `Existing` + /// + /// # TODO + /// Remove this allow later + #[allow(dead_code)] + pub(crate) fn add_existing_entry(&mut self, mut entry: ManifestEntry) -> Result<()> { + self.check_data_file(&entry.data_file)?; + entry.status = ManifestStatus::Existing; + self.add_entry_inner(entry)?; + Ok(()) + } + + /// Add a file as an existing manifest entry. The original data and file sequence numbers and + /// snapshot ID, which were assigned at commit, must be preserved when adding an existing entry. + pub fn add_existing_file( + &mut self, + data_file: DataFile, + snapshot_id: i64, + sequence_number: i64, + file_sequence_number: Option<i64>, + ) -> Result<()> { + self.check_data_file(&data_file)?; + let entry = ManifestEntry { + status: ManifestStatus::Existing, + snapshot_id: Some(snapshot_id), + sequence_number: Some(sequence_number), + file_sequence_number, + data_file, + }; + self.add_entry_inner(entry)?; + Ok(()) + } + + fn add_entry_inner(&mut self, entry: ManifestEntry) -> Result<()> { + // Check that the entry has a sequence number + if (entry.status == ManifestStatus::Deleted || entry.status == ManifestStatus::Existing) + && (entry.sequence_number.is_none() || entry.file_sequence_number.is_none()) + { + return Err(Error::new( + ErrorKind::DataInvalid, + "Manifest entry with status Existing or Deleted should have sequence number", + )); + } + + // Update the statistics + match entry.status { + ManifestStatus::Added => { + self.added_files += 1; + self.added_rows += entry.data_file.record_count; + } + ManifestStatus::Deleted => { + self.deleted_files += 1; + self.deleted_rows += entry.data_file.record_count; + } + ManifestStatus::Existing => { + self.existing_files += 1; + self.existing_rows += entry.data_file.record_count; + } + } + if entry.is_alive() { + if let Some(seq_num) = entry.sequence_number { + self.min_seq_num = Some(self.min_seq_num.map_or(seq_num, |v| min(v, seq_num))); + } + } + self.manifest_entries.push(entry); + Ok(()) + } + + /// Write the manifest file and return it.
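+ /// + /// Consumes the writer. Until a commit assigns real sequence numbers, the returned + /// [`ManifestFile`] carries `UNASSIGNED_SEQUENCE_NUMBER`; a sketch of the final step + /// (continuing the builder example above): + /// + /// ```ignore + /// let manifest_file = writer.write_manifest_file().await?; + /// assert_eq!(manifest_file.sequence_number, UNASSIGNED_SEQUENCE_NUMBER); + /// ```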
+ pub async fn write_manifest_file(mut self) -> Result<ManifestFile> { // Create the avro writer - let partition_type = manifest + let partition_type = self .metadata .partition_spec - .partition_type(&manifest.metadata.schema)?; - let table_schema = &manifest.metadata.schema; - let avro_schema = match manifest.metadata.format_version { + .partition_type(&self.metadata.schema)?; + let table_schema = &self.metadata.schema; + let avro_schema = match self.metadata.format_version { FormatVersion::V1 => manifest_schema_v1(&partition_type)?, FormatVersion::V2 => manifest_schema_v2(&partition_type)?, }; @@ -259,69 +499,36 @@ impl ManifestWriter { )?; avro_writer.add_user_metadata( "partition-spec".to_string(), - to_vec(&manifest.metadata.partition_spec.fields()).map_err(|err| { + to_vec(&self.metadata.partition_spec.fields()).map_err(|err| { Error::new(ErrorKind::DataInvalid, "Fail to serialize partition spec") .with_source(err) })?, )?; avro_writer.add_user_metadata( "partition-spec-id".to_string(), - manifest.metadata.partition_spec.spec_id().to_string(), + self.metadata.partition_spec.spec_id().to_string(), )?; avro_writer.add_user_metadata( "format-version".to_string(), - (manifest.metadata.format_version as u8).to_string(), + (self.metadata.format_version as u8).to_string(), )?; - if manifest.metadata.format_version == FormatVersion::V2 { + if self.metadata.format_version == FormatVersion::V2 { avro_writer - .add_user_metadata("content".to_string(), manifest.metadata.content.to_string())?; + .add_user_metadata("content".to_string(), self.metadata.content.to_string())?; } + let partition_summary = self.construct_partition_summaries(&partition_type)?; // Write manifest entries - for entry in manifest.entries { - if (entry.status == ManifestStatus::Deleted || entry.status == ManifestStatus::Existing) - && (entry.sequence_number.is_none() || entry.file_sequence_number.is_none()) - { - return Err(Error::new( - ErrorKind::DataInvalid, - "Manifest entry with status Existing or Deleted should have sequence number", - )); - } - - match entry.status { - ManifestStatus::Added => { - self.added_files += 1; - self.added_rows += entry.data_file.record_count; + for entry in std::mem::take(&mut self.manifest_entries) { + let value = match self.metadata.format_version { + FormatVersion::V1 => { + to_value(_serde::ManifestEntryV1::try_from(entry, &partition_type)?)? + .resolve(&avro_schema)? } - ManifestStatus::Deleted => { - self.deleted_files += 1; - self.deleted_rows += entry.data_file.record_count; + FormatVersion::V2 => { + to_value(_serde::ManifestEntryV2::try_from(entry, &partition_type)?)? + .resolve(&avro_schema)? } - ManifestStatus::Existing => { - self.existing_files += 1; - self.existing_rows += entry.data_file.record_count; - } - } - - if entry.is_alive() { - if let Some(seq_num) = entry.sequence_number { - self.min_seq_num = Some(self.min_seq_num.map_or(seq_num, |v| min(v, seq_num))); - } - } - - self.partitions.push(entry.data_file.partition.clone()); - - let value = match manifest.metadata.format_version { - FormatVersion::V1 => to_value(_serde::ManifestEntryV1::try_from( - (*entry).clone(), - &partition_type, - )?)? - .resolve(&avro_schema)?, - FormatVersion::V2 => to_value(_serde::ManifestEntryV2::try_from( - (*entry).clone(), - &partition_type, - )?)?
- .resolve(&avro_schema)?, }; avro_writer.append(value)?; @@ -331,18 +538,16 @@ impl ManifestWriter { let length = content.len(); self.output.write(Bytes::from(content)).await?; - let partition_summary = self.construct_partition_summaries(&partition_type)?; - Ok(ManifestFile { manifest_path: self.output.location().to_string(), manifest_length: length as i64, - partition_spec_id: manifest.metadata.partition_spec.spec_id(), - content: manifest.metadata.content, + partition_spec_id: self.metadata.partition_spec.spec_id(), + content: self.metadata.content, // sequence_number and min_sequence_number with UNASSIGNED_SEQUENCE_NUMBER will be replaced with // the real sequence number in `ManifestListWriter`. sequence_number: UNASSIGNED_SEQUENCE_NUMBER, min_sequence_number: self.min_seq_num.unwrap_or(UNASSIGNED_SEQUENCE_NUMBER), - added_snapshot_id: self.snapshot_id, + added_snapshot_id: self.snapshot_id.unwrap_or(UNASSIGNED_SNAPSHOT_ID), added_files_count: Some(self.added_files), existing_files_count: Some(self.existing_files), deleted_files_count: Some(self.deleted_files), @@ -1699,16 +1904,18 @@ mod tests { .build() .unwrap(), ); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 0, - schema: schema.clone(), - partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), - content: ManifestContentType::Data, - format_version: FormatVersion::V2, - }, - entries: vec![ - Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 0, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0) + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V2, + }; + let mut entries = vec![ + ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, sequence_number: None, @@ -1731,13 +1938,34 @@ mod tests { equality_ids: Vec::new(), sort_order_id: None, } - }) - ] - }; + } + ]; - let writer = |output_file: OutputFile| ManifestWriter::new(output_file, 1, vec![]); + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v2_data(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + writer.write_manifest_file().await.unwrap(); - test_manifest_read_write(manifest, writer).await; + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); + // The snapshot id is assigned when the entry is added to the manifest.
+ entries[0].snapshot_id = Some(1); + assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } #[tokio::test] @@ -1812,17 +2040,21 @@ mod tests { .build() .unwrap(), ); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 0, - schema: schema.clone(), - partition_spec: PartitionSpec::builder(schema) - .with_spec_id(0).add_partition_field("v_int", "v_int", Transform::Identity).unwrap() - .add_partition_field("v_long", "v_long", Transform::Identity).unwrap().build().unwrap(), - content: ManifestContentType::Data, - format_version: FormatVersion::V2, - }, - entries: vec![Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 0, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0) + .add_partition_field("v_int", "v_int", Transform::Identity) + .unwrap() + .add_partition_field("v_long", "v_long", Transform::Identity) + .unwrap() + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V2, + }; + let mut entries = vec![ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, sequence_number: None, @@ -1887,15 +2119,38 @@ mod tests { equality_ids: vec![], sort_order_id: None, }, - })], - }; - - let writer = |output_file: OutputFile| ManifestWriter::new(output_file, 1, vec![]); + }]; - let res = test_manifest_read_write(manifest, writer).await; + // write manifest to file and check the returned manifest file. + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(2), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v2_data(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + let manifest_file = writer.write_manifest_file().await.unwrap(); + assert_eq!(manifest_file.sequence_number, UNASSIGNED_SEQUENCE_NUMBER); + assert_eq!( + manifest_file.min_sequence_number, + UNASSIGNED_SEQUENCE_NUMBER + ); - assert_eq!(res.sequence_number, UNASSIGNED_SEQUENCE_NUMBER); - assert_eq!(res.min_sequence_number, UNASSIGNED_SEQUENCE_NUMBER); + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); + // The snapshot id is assigned when the entry is added to the manifest.
+ entries[0].snapshot_id = Some(2); + assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } #[tokio::test] @@ -1923,15 +2178,17 @@ mod tests { .build() .unwrap(), ); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 1, - schema: schema.clone(), - partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), - content: ManifestContentType::Data, - format_version: FormatVersion::V1, - }, - entries: vec![Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 1, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0) + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V1, + }; + let mut entries = vec![ManifestEntry { status: ManifestStatus::Added, snapshot_id: Some(0), sequence_number: Some(0), @@ -1954,13 +2211,33 @@ mod tests { equality_ids: vec![], sort_order_id: Some(0), } - })], - }; + }]; - let writer = - |output_file: OutputFile| ManifestWriter::new(output_file, 2966623707104393227, vec![]); + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(3), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v1(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + writer.write_manifest_file().await.unwrap(); - test_manifest_read_write(manifest, writer).await; + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); + // The snapshot id is assigned when the entry is added to the manifest. 
+ entries[0].snapshot_id = Some(3); + assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } #[tokio::test] @@ -1987,16 +2264,19 @@ mod tests { .build() .unwrap(), ); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 0, - schema: schema.clone(), - partition_spec: PartitionSpec::builder(schema).add_partition_field("category", "category", Transform::Identity).unwrap().build().unwrap(), - content: ManifestContentType::Data, - format_version: FormatVersion::V1, - }, - entries: vec![ - Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 0, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .add_partition_field("category", "category", Transform::Identity) + .unwrap() + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V1, + }; + let mut entries = vec![ + ManifestEntry { status: ManifestStatus::Added, snapshot_id: Some(0), sequence_number: Some(0), @@ -2034,17 +2314,43 @@ mod tests { equality_ids: vec![], sort_order_id: Some(0), }, - }) - ] - }; - - let writer = |output_file: OutputFile| ManifestWriter::new(output_file, 1, vec![]); + } + ]; - let entry = test_manifest_read_write(manifest, writer).await; + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(2), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v1(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + let manifest_file = writer.write_manifest_file().await.unwrap(); + assert_eq!(manifest_file.partitions.len(), 1); + assert_eq!( + manifest_file.partitions[0].lower_bound, + Some(Datum::string("x")) + ); + assert_eq!( + manifest_file.partitions[0].upper_bound, + Some(Datum::string("x")) + ); - assert_eq!(entry.partitions.len(), 1); - assert_eq!(entry.partitions[0].lower_bound, Some(Datum::string("x"))); - assert_eq!(entry.partitions[0].upper_bound, Some(Datum::string("x"))); + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); + // The snapshot id is assigned when the entry is added to the manifest. 
+ entries[0].snapshot_id = Some(2); + assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } #[tokio::test] @@ -2066,15 +2372,17 @@ mod tests { .build() .unwrap(), ); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 0, - schema: schema.clone(), - partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), - content: ManifestContentType::Data, - format_version: FormatVersion::V2, - }, - entries: vec![Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 0, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0) + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V2, + }; + let entries = vec![ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, sequence_number: None, @@ -2109,18 +2417,34 @@ mod tests { equality_ids: vec![], sort_order_id: None, }, - })], - }; - - let writer = |output_file: OutputFile| ManifestWriter::new(output_file, 1, vec![]); + }]; - let (avro_bytes, _) = write_manifest(&manifest, writer).await; + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(2), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v2_data(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + writer.write_manifest_file().await.unwrap(); - // The parse should succeed. - let actual_manifest = Manifest::parse_avro(avro_bytes.as_slice()).unwrap(); + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); // Compared with the original manifest, the lower_bounds and upper_bounds no longer have data for field 3, and // other parts should be the same. + // The snapshot id is assigned when the entry is added to the manifest.
let schema = Arc::new( Schema::builder() .with_fields(vec![ @@ -2148,7 +2472,7 @@ mod tests { }, entries: vec![Arc::new(ManifestEntry { status: ManifestStatus::Added, - snapshot_id: None, + snapshot_id: Some(2), sequence_number: None, file_sequence_number: None, data_file: DataFile { @@ -2219,16 +2543,15 @@ mod tests { .unwrap() .build() .unwrap(); - let manifest = Manifest { - metadata: ManifestMetadata { - schema_id: 0, - schema, - partition_spec, - content: ManifestContentType::Data, - format_version: FormatVersion::V2, - }, - entries: vec![ - Arc::new(ManifestEntry { + let metadata = ManifestMetadata { + schema_id: 0, + schema, + partition_spec, + content: ManifestContentType::Data, + format_version: FormatVersion::V2, + }; + let entries = vec![ + ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, sequence_number: None, @@ -2257,8 +2580,7 @@ mod tests { equality_ids: Vec::new(), sort_order_id: None, } - }), - Arc::new( + }, ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, @@ -2288,9 +2610,7 @@ mod tests { equality_ids: Vec::new(), sort_order_id: None, } - } - ), - Arc::new( + }, ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, @@ -2320,9 +2640,7 @@ mod tests { equality_ids: Vec::new(), sort_order_id: None, } - } - ), - Arc::new( + }, ManifestEntry { status: ManifestStatus::Added, snapshot_id: None, @@ -2352,56 +2670,177 @@ mod tests { equality_ids: Vec::new(), sort_order_id: None, } - } - ), - ] - }; + }, + ]; - let writer = |output_file: OutputFile| ManifestWriter::new(output_file, 1, vec![]); + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); + let io = FileIOBuilder::new_fs_io().build().unwrap(); + let output_file = io.new_output(path.to_str().unwrap()).unwrap(); + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(1), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v2_data(); + for entry in &entries { + writer.add_entry(entry.clone()).unwrap(); + } + let res = writer.write_manifest_file().await.unwrap(); - let res = test_manifest_read_write(manifest, writer).await; - assert!(res.partitions.len() == 3); - assert!(res.partitions[0].lower_bound == Some(Datum::int(1111))); - assert!(res.partitions[0].upper_bound == Some(Datum::int(2021))); + assert_eq!(res.partitions.len(), 3); + assert_eq!(res.partitions[0].lower_bound, Some(Datum::int(1111))); + assert_eq!(res.partitions[0].upper_bound, Some(Datum::int(2021))); assert!(!res.partitions[0].contains_null); - assert!(res.partitions[0].contains_nan == Some(false)); + assert_eq!(res.partitions[0].contains_nan, Some(false)); - assert!(res.partitions[1].lower_bound == Some(Datum::float(1.0))); - assert!(res.partitions[1].upper_bound == Some(Datum::float(15.5))); + assert_eq!(res.partitions[1].lower_bound, Some(Datum::float(1.0))); + assert_eq!(res.partitions[1].upper_bound, Some(Datum::float(15.5))); assert!(res.partitions[1].contains_null); - assert!(res.partitions[1].contains_nan == Some(true)); + assert_eq!(res.partitions[1].contains_nan, Some(true)); - assert!(res.partitions[2].lower_bound == Some(Datum::double(1.0))); - assert!(res.partitions[2].upper_bound == Some(Datum::double(25.5))); + assert_eq!(res.partitions[2].lower_bound, Some(Datum::double(1.0))); + assert_eq!(res.partitions[2].upper_bound, Some(Datum::double(25.5))); assert!(!res.partitions[2].contains_null); - assert!(res.partitions[2].contains_nan == Some(false)); + 
assert_eq!(res.partitions[2].contains_nan, Some(false)); } - async fn test_manifest_read_write( - manifest: Manifest, - writer_builder: impl FnOnce(OutputFile) -> ManifestWriter, - ) -> ManifestFile { - let (bs, res) = write_manifest(&manifest, writer_builder).await; - let actual_manifest = Manifest::parse_avro(bs.as_slice()).unwrap(); - - assert_eq!(actual_manifest, manifest); - res - } + #[tokio::test] + async fn test_add_delete_existing() { + let schema = Arc::new( + Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "name", + Type::Primitive(PrimitiveType::String), + )), + ]) + .build() + .unwrap(), + ); + let metadata = ManifestMetadata { + schema_id: 0, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0) + .build() + .unwrap(), + content: ManifestContentType::Data, + format_version: FormatVersion::V2, + }; + let mut entries = vec![ + ManifestEntry { + status: ManifestStatus::Added, + snapshot_id: None, + sequence_number: Some(1), + file_sequence_number: Some(1), + data_file: DataFile { + content: DataContentType::Data, + file_path: "s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(), + file_format: DataFileFormat::Parquet, + partition: Struct::empty(), + record_count: 1, + file_size_in_bytes: 5442, + column_sizes: HashMap::from([(1, 61), (2, 73)]), + value_counts: HashMap::from([(1, 1), (2, 1)]), + null_value_counts: HashMap::from([(1, 0), (2, 0)]), + nan_value_counts: HashMap::new(), + lower_bounds: HashMap::new(), + upper_bounds: HashMap::new(), + key_metadata: Some(Vec::new()), + split_offsets: vec![4], + equality_ids: Vec::new(), + sort_order_id: None, + }, + }, + ManifestEntry { + status: ManifestStatus::Deleted, + snapshot_id: Some(1), + sequence_number: Some(1), + file_sequence_number: Some(1), + data_file: DataFile { + content: DataContentType::Data, + file_path: "s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(), + file_format: DataFileFormat::Parquet, + partition: Struct::empty(), + record_count: 1, + file_size_in_bytes: 5442, + column_sizes: HashMap::from([(1, 61), (2, 73)]), + value_counts: HashMap::from([(1, 1), (2, 1)]), + null_value_counts: HashMap::from([(1, 0), (2, 0)]), + nan_value_counts: HashMap::new(), + lower_bounds: HashMap::new(), + upper_bounds: HashMap::new(), + key_metadata: Some(Vec::new()), + split_offsets: vec![4], + equality_ids: Vec::new(), + sort_order_id: None, + }, + }, + ManifestEntry { + status: ManifestStatus::Existing, + snapshot_id: Some(1), + sequence_number: Some(1), + file_sequence_number: Some(1), + data_file: DataFile { + content: DataContentType::Data, + file_path: "s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(), + file_format: DataFileFormat::Parquet, + partition: Struct::empty(), + record_count: 1, + file_size_in_bytes: 5442, + column_sizes: HashMap::from([(1, 61), (2, 73)]), + value_counts: HashMap::from([(1, 1), (2, 1)]), + null_value_counts: HashMap::from([(1, 0), (2, 0)]), + nan_value_counts: HashMap::new(), + lower_bounds: HashMap::new(), + upper_bounds: HashMap::new(), + key_metadata: Some(Vec::new()), + split_offsets: vec![4], + equality_ids: Vec::new(), + sort_order_id: None, + }, + }, + ]; - /// Utility method which writes out a manifest and returns the bytes. 
- async fn write_manifest( - manifest: &Manifest, - writer_builder: impl FnOnce(OutputFile) -> ManifestWriter, - ) -> (Vec<u8>, ManifestFile) { - let temp_dir = TempDir::new().unwrap(); - let path = temp_dir.path().join("test_manifest.avro"); + // write manifest to file + let tmp_dir = TempDir::new().unwrap(); + let path = tmp_dir.path().join("test_manifest.avro"); let io = FileIOBuilder::new_fs_io().build().unwrap(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); - let writer = writer_builder(output_file); - let res = writer.write(manifest.clone()).await.unwrap(); - - // Verify manifest - (fs::read(path).expect("read_file must succeed"), res) + let mut writer = ManifestWriterBuilder::new( + output_file, + Some(3), + vec![], + metadata.schema.clone(), + metadata.partition_spec.clone(), + ) + .build_v2_data(); + writer.add_entry(entries[0].clone()).unwrap(); + writer.add_delete_entry(entries[1].clone()).unwrap(); + writer.add_existing_entry(entries[2].clone()).unwrap(); + writer.write_manifest_file().await.unwrap(); + + // read back the manifest file and check the content + let actual_manifest = + Manifest::parse_avro(fs::read(path).expect("read_file must succeed").as_slice()) + .unwrap(); + + // The snapshot id is assigned when an entry is added or deleted in the manifest; existing entries keep their original snapshot id. + entries[0].snapshot_id = Some(3); + entries[1].snapshot_id = Some(3); + // The file sequence number is reset to `None` when the entry is added to the manifest. + entries[0].file_sequence_number = None; + assert_eq!(actual_manifest, Manifest::new(metadata, entries)); } #[tokio::test] diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 709c4cdae..f290441aa 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -30,7 +30,7 @@ use super::NestedField; use crate::error::Result; use crate::expr::accessor::StructAccessor; use crate::spec::datatypes::{ - ListType, MapType, NestedFieldRef, PrimitiveType, StructType, Type, LIST_FILED_NAME, + ListType, MapType, NestedFieldRef, PrimitiveType, StructType, Type, LIST_FIELD_NAME, MAP_KEY_FIELD_NAME, MAP_VALUE_FIELD_NAME, }; use crate::{ensure_data_valid, Error, ErrorKind}; @@ -774,7 +774,7 @@ impl SchemaVisitor for IndexByName { } fn list(&mut self, list: &ListType, _value: Self::T) -> Result<Self::T> { - self.add_field(LIST_FILED_NAME, list.element_field.id) + self.add_field(LIST_FIELD_NAME, list.element_field.id) } fn map(&mut self, map: &MapType, _key_value: Self::T, _value: Self::T) -> Result<Self::T> { diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index f24a3c26b..e73b8abaa 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -34,6 +34,8 @@ use crate::{Error, ErrorKind}; /// The ref name of the main branch of the table. pub const MAIN_BRANCH: &str = "main"; +/// Placeholder for snapshot ID. The field with this value must be replaced with the actual snapshot ID before it is committed. +pub const UNASSIGNED_SNAPSHOT_ID: i64 = -1; /// Reference to [`Snapshot`].
pub type SnapshotRef = Arc<Snapshot>; diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs b/crates/iceberg/src/spec/table_metadata_builder.rs index 4e8c1ae76..cbf2e5e30 100644 --- a/crates/iceberg/src/spec/table_metadata_builder.rs +++ b/crates/iceberg/src/spec/table_metadata_builder.rs @@ -831,7 +831,7 @@ impl TableMetadataBuilder { if sort_order_found { if self.last_added_order_id != Some(new_order_id) { self.changes.push(TableUpdate::AddSortOrder { - sort_order: sort_order.clone(), + sort_order: sort_order.clone().with_order_id(new_order_id), }); self.last_added_order_id = Some(new_order_id); } diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs index f92ca263d..0dbd3ad5e 100644 --- a/crates/iceberg/src/spec/values.rs +++ b/crates/iceberg/src/spec/values.rs @@ -3569,6 +3569,9 @@ mod tests { let value = Datum::timestamp_from_str("2021-08-01T01:09:00.0899").unwrap(); assert_eq!(&format!("{value}"), "2021-08-01 01:09:00.089900"); + let value = Datum::timestamp_from_str("2023-01-06T00:00:00").unwrap(); + assert_eq!(&format!("{value}"), "2023-01-06 00:00:00"); + let value = Datum::timestamp_from_str("2021-08-01T01:09:00.0899+0800"); assert!(value.is_err(), "Parse timestamp with timezone should fail!"); diff --git a/crates/iceberg/src/table.rs b/crates/iceberg/src/table.rs index fa5304855..ebee670f4 100644 --- a/crates/iceberg/src/table.rs +++ b/crates/iceberg/src/table.rs @@ -20,9 +20,9 @@ use std::sync::Arc; use crate::arrow::ArrowReaderBuilder; +use crate::inspect::MetadataTable; use crate::io::object_cache::ObjectCache; use crate::io::FileIO; -use crate::metadata_scan::MetadataTable; use crate::scan::TableScanBuilder; use crate::spec::{TableMetadata, TableMetadataRef}; use crate::{Error, ErrorKind, Result, TableIdent}; @@ -203,7 +203,7 @@ impl Table { /// Creates a metadata table which provides table-like APIs for inspecting metadata. /// See [`MetadataTable`] for more details. - pub fn metadata_table(self) -> MetadataTable { + pub fn inspect(&self) -> MetadataTable<'_> { MetadataTable::new(self) } diff --git a/crates/iceberg/src/transaction.rs b/crates/iceberg/src/transaction.rs index cfd6a8381..c27a107da 100644 --- a/crates/iceberg/src/transaction.rs +++ b/crates/iceberg/src/transaction.rs @@ -28,10 +28,9 @@ use uuid::Uuid; use crate::error::Result; use crate::io::OutputFile; use crate::spec::{ - DataFile, DataFileFormat, FormatVersion, Manifest, ManifestEntry, ManifestFile, - ManifestListWriter, ManifestMetadata, ManifestWriter, NullOrder, Operation, Snapshot, - SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, Struct, StructType, - Summary, Transform, MAIN_BRANCH, + DataFile, DataFileFormat, FormatVersion, ManifestEntry, ManifestFile, ManifestListWriter, + ManifestWriterBuilder, NullOrder, Operation, Snapshot, SnapshotReference, SnapshotRetention, + SortDirection, SortField, SortOrder, Struct, StructType, Summary, Transform, MAIN_BRANCH, }; use crate::table::Table; use crate::TableUpdate::UpgradeFormatVersion; @@ -378,43 +377,42 @@ impl<'a> SnapshotProduceAction<'a> { // Write manifest file for added data files and return the ManifestFile for ManifestList.
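+ // The table's format version picks the writer used below: V1 stamps the snapshot id + // on every entry up front, while V2 leaves it unset so that it can be inherited when + // the manifest is read back.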
async fn write_added_manifest(&mut self) -> Result<ManifestFile> { let added_data_files = std::mem::take(&mut self.added_data_files); - let manifest_entries = added_data_files - .into_iter() - .map(|data_file| { - let builder = ManifestEntry::builder() - .status(crate::spec::ManifestStatus::Added) - .data_file(data_file); - if self.tx.table.metadata().format_version() == FormatVersion::V1 { - builder.snapshot_id(self.snapshot_id).build() - } else { - // For format version > 1, we set the snapshot id at the inherited time to avoid rewrite the manifest file when - // commit failed. - builder.build() - } - }) - .collect(); - let schema = self.tx.table.metadata().current_schema(); - let manifest_meta = ManifestMetadata::builder() - .schema(schema.clone()) - .schema_id(schema.schema_id()) - .format_version(self.tx.table.metadata().format_version()) - .partition_spec( + let snapshot_id = self.snapshot_id; + let manifest_entries = added_data_files.into_iter().map(|data_file| { + let builder = ManifestEntry::builder() + .status(crate::spec::ManifestStatus::Added) + .data_file(data_file); + if self.tx.table.metadata().format_version() == FormatVersion::V1 { + builder.snapshot_id(snapshot_id).build() + } else { + // For format version > 1, the snapshot id is inherited when the manifest is read, + // so we leave it unset here to avoid rewriting the manifest file if the commit fails. + builder.build() + } + }); + let mut writer = { + let builder = ManifestWriterBuilder::new( + self.new_manifest_output()?, + Some(self.snapshot_id), + self.key_metadata.clone(), + self.tx.table.metadata().current_schema().clone(), self.tx .table .metadata() .default_partition_spec() .as_ref() .clone(), - ) - .content(crate::spec::ManifestContentType::Data) - .build(); - let manifest = Manifest::new(manifest_meta, manifest_entries); - let writer = ManifestWriter::new( - self.new_manifest_output()?, - self.snapshot_id, - self.key_metadata.clone(), - ); - writer.write(manifest).await + ); + if self.tx.table.metadata().format_version() == FormatVersion::V1 { + builder.build_v1() + } else { + builder.build_v2_data() + } + }; + for entry in manifest_entries { + writer.add_entry(entry)?; + } + writer.write_manifest_file().await } async fn manifest_file( diff --git a/crates/iceberg/src/utils.rs b/crates/iceberg/src/utils.rs index 70514cccb..00d3e69bd 100644 --- a/crates/iceberg/src/utils.rs +++ b/crates/iceberg/src/utils.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::num::NonZero; +use std::num::NonZeroUsize; // Use a default value of 1 as the safest option. // See https://doc.rust-lang.org/std/thread/fn.available_parallelism.html#limitations @@ -31,12 +31,12 @@ const DEFAULT_PARALLELISM: usize = 1; /// are circumstances where the level of available /// parallelism can change during the lifetime of an executing /// process, but this should not be called in a hot loop. -pub(crate) fn available_parallelism() -> NonZero<usize> { +pub(crate) fn available_parallelism() -> NonZeroUsize { std::thread::available_parallelism().unwrap_or_else(|_err| { // Failed to get the level of parallelism. // TODO: log/trace when this fallback occurs. // Using a default value.
- NonZero::new(DEFAULT_PARALLELISM).unwrap() + NonZeroUsize::new(DEFAULT_PARALLELISM).unwrap() }) } diff --git a/crates/iceberg/testdata/puffin/java-generated/empty-puffin-uncompressed.bin b/crates/iceberg/testdata/puffin/java-generated/empty-puffin-uncompressed.bin new file mode 100644 index 000000000..142b45bd4 Binary files /dev/null and b/crates/iceberg/testdata/puffin/java-generated/empty-puffin-uncompressed.bin differ diff --git a/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-compressed-zstd.bin b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-compressed-zstd.bin new file mode 100644 index 000000000..ac8b69c76 Binary files /dev/null and b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-compressed-zstd.bin differ diff --git a/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-uncompressed.bin b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-uncompressed.bin new file mode 100644 index 000000000..ab8da1382 Binary files /dev/null and b/crates/iceberg/testdata/puffin/java-generated/sample-metric-data-uncompressed.bin differ diff --git a/crates/integration_tests/Cargo.toml b/crates/integration_tests/Cargo.toml index a047d7580..1d1bfd9c2 100644 --- a/crates/integration_tests/Cargo.toml +++ b/crates/integration_tests/Cargo.toml @@ -27,9 +27,12 @@ rust-version = { workspace = true } [dependencies] arrow-array = { workspace = true } arrow-schema = { workspace = true } +datafusion = { workspace = true } futures = { workspace = true } iceberg = { workspace = true } iceberg-catalog-rest = { workspace = true } +iceberg-datafusion = { workspace = true } iceberg_test_utils = { path = "../test_utils", features = ["tests"] } parquet = { workspace = true } tokio = { workspace = true } +uuid = { workspace = true } diff --git a/crates/integration_tests/testdata/spark/Dockerfile b/crates/integration_tests/testdata/spark/Dockerfile index 74aefbd50..a20dc8662 100644 --- a/crates/integration_tests/testdata/spark/Dockerfile +++ b/crates/integration_tests/testdata/spark/Dockerfile @@ -27,7 +27,7 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events WORKDIR ${SPARK_HOME} -ENV SPARK_VERSION=3.5.3 +ENV SPARK_VERSION=3.5.4 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 ENV ICEBERG_VERSION=1.6.0 diff --git a/crates/integration_tests/testdata/spark/provision.py b/crates/integration_tests/testdata/spark/provision.py index 364e366b4..3b0742125 100755 --- a/crates/integration_tests/testdata/spark/provision.py +++ b/crates/integration_tests/testdata/spark/provision.py @@ -119,3 +119,25 @@ spark.sql("INSERT INTO rest.default.test_promote_column VALUES (19)") spark.sql("ALTER TABLE rest.default.test_promote_column ALTER COLUMN foo TYPE bigint") spark.sql("INSERT INTO rest.default.test_promote_column VALUES (25)") + +# Create a table with various types +spark.sql(""" +CREATE OR REPLACE TABLE rest.default.types_test USING ICEBERG AS +SELECT + CAST(s % 2 = 1 AS BOOLEAN) AS cboolean, + CAST(s % 256 - 128 AS TINYINT) AS ctinyint, + CAST(s AS SMALLINT) AS csmallint, + CAST(s AS INT) AS cint, + CAST(s AS BIGINT) AS cbigint, + CAST(s AS FLOAT) AS cfloat, + CAST(s AS DOUBLE) AS cdouble, + CAST(s / 100.0 AS DECIMAL(8, 2)) AS cdecimal, + CAST(DATE('1970-01-01') + s AS DATE) AS cdate, + CAST(from_unixtime(s) AS TIMESTAMP_NTZ) AS ctimestamp_ntz, + CAST(from_unixtime(s) AS TIMESTAMP) AS ctimestamp, + CAST(s AS STRING) AS cstring, + 
CAST(s AS BINARY) AS cbinary +FROM ( + SELECT EXPLODE(SEQUENCE(0, 1000)) AS s +); +""") diff --git a/crates/integration_tests/tests/datafusion.rs b/crates/integration_tests/tests/datafusion.rs new file mode 100644 index 000000000..1586298ff --- /dev/null +++ b/crates/integration_tests/tests/datafusion.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_schema::TimeUnit; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::assert_batches_eq; +use datafusion::catalog::TableProvider; +use datafusion::error::DataFusionError; +use datafusion::prelude::SessionContext; +use iceberg::{Catalog, TableIdent}; +use iceberg_datafusion::IcebergTableProvider; +use iceberg_integration_tests::set_test_fixture; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +#[tokio::test] +async fn test_basic_queries() -> Result<(), DataFusionError> { + let fixture = set_test_fixture("datafusion_basic_read").await; + + let catalog = fixture.rest_catalog; + + let table = catalog + .load_table(&TableIdent::from_strs(["default", "types_test"]).unwrap()) + .await + .unwrap(); + + let ctx = SessionContext::new(); + + let table_provider = Arc::new( + IcebergTableProvider::try_new_from_table(table) + .await + .unwrap(), + ); + + let schema = table_provider.schema(); + + assert_eq!( + schema.as_ref(), + &Schema::new(vec![ + Field::new("cboolean", DataType::Boolean, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("ctinyint", DataType::Int32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + Field::new("csmallint", DataType::Int32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "3".to_string(), + )])), + Field::new("cint", DataType::Int32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "4".to_string(), + )])), + Field::new("cbigint", DataType::Int64, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "5".to_string(), + )])), + Field::new("cfloat", DataType::Float32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "6".to_string(), + )])), + Field::new("cdouble", DataType::Float64, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "7".to_string(), + )])), + Field::new("cdecimal", DataType::Decimal128(8, 2), true).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "8".to_string(),)] + )), + Field::new("cdate", DataType::Date32, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "9".to_string(), + )])), + Field::new( + "ctimestamp_ntz", + 
DataType::Timestamp(TimeUnit::Microsecond, None), + true + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "10".to_string(), + )])), + Field::new( + "ctimestamp", + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))), + true + ) + .with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "11".to_string(), + )])), + Field::new("cstring", DataType::Utf8, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "12".to_string(), + )])), + Field::new("cbinary", DataType::LargeBinary, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "13".to_string(), + )])), + ]) + ); + + ctx.register_table("types_table", table_provider)?; + + let batches = ctx + .sql("SELECT * FROM types_table ORDER BY cbigint LIMIT 3") + .await? + .collect() + .await?; + let expected = [ + "+----------+----------+-----------+------+---------+--------+---------+----------+------------+---------------------+----------------------+---------+----------+", + "| cboolean | ctinyint | csmallint | cint | cbigint | cfloat | cdouble | cdecimal | cdate | ctimestamp_ntz | ctimestamp | cstring | cbinary |", + "+----------+----------+-----------+------+---------+--------+---------+----------+------------+---------------------+----------------------+---------+----------+", + "| false | -128 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00 | 1970-01-01 | 1970-01-01T00:00:00 | 1970-01-01T00:00:00Z | 0 | 00000000 |", + "| true | -127 | 1 | 1 | 1 | 1.0 | 1.0 | 0.01 | 1970-01-02 | 1970-01-01T00:00:01 | 1970-01-01T00:00:01Z | 1 | 00000001 |", + "| false | -126 | 2 | 2 | 2 | 2.0 | 2.0 | 0.02 | 1970-01-03 | 1970-01-01T00:00:02 | 1970-01-01T00:00:02Z | 2 | 00000002 |", + "+----------+----------+-----------+------+---------+--------+---------+----------+------------+---------------------+----------------------+---------+----------+", + ]; + assert_batches_eq!(expected, &batches); + Ok(()) +} diff --git a/crates/integration_tests/tests/scan_all_type.rs b/crates/integration_tests/tests/scan_all_type.rs new file mode 100644 index 000000000..517f6ea87 --- /dev/null +++ b/crates/integration_tests/tests/scan_all_type.rs @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Integration tests for rest catalog. 
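+//! +//! The test creates a table whose schema covers every primitive type plus +//! struct, list, and map, writes one Arrow record batch through the data +//! file writer, commits it, and scans the rows back both with an explicit +//! column list and with `select_all`.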
+ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::builder::{Int32Builder, ListBuilder, MapBuilder, StringBuilder}; +use arrow_array::{ + Array, ArrayRef, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, MapArray, RecordBatch, + StringArray, StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, +}; +use arrow_schema::{DataType, Field, Fields}; +use futures::TryStreamExt; +use iceberg::arrow::{DEFAULT_MAP_FIELD_NAME, UTC_TIME_ZONE}; +use iceberg::spec::{ + ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type, LIST_FIELD_NAME, + MAP_KEY_FIELD_NAME, MAP_VALUE_FIELD_NAME, +}; +use iceberg::transaction::Transaction; +use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; +use iceberg::writer::file_writer::location_generator::{ + DefaultFileNameGenerator, DefaultLocationGenerator, +}; +use iceberg::writer::file_writer::ParquetWriterBuilder; +use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; +use iceberg::{Catalog, Namespace, NamespaceIdent, TableCreation}; +use iceberg_integration_tests::set_test_fixture; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use parquet::file::properties::WriterProperties; +use uuid::Uuid; + +#[tokio::test] +async fn test_scan_all_type() { + let fixture = set_test_fixture("test_scan_all_type").await; + + let ns = Namespace::with_properties( + NamespaceIdent::from_strs(["apple", "ios"]).unwrap(), + HashMap::from([ + ("owner".to_string(), "ray".to_string()), + ("community".to_string(), "apache".to_string()), + ]), + ); + + fixture + .rest_catalog + .create_namespace(ns.name(), ns.properties().clone()) + .await + .unwrap(); + let schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![2]) + .with_fields(vec![ + // test all types + NestedField::required(1, "int", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "long", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(3, "float", Type::Primitive(PrimitiveType::Float)).into(), + NestedField::required(4, "double", Type::Primitive(PrimitiveType::Double)).into(), + NestedField::required( + 5, + "decimal", + Type::Primitive(PrimitiveType::Decimal { + precision: 20, + scale: 5, + }), + ) + .into(), + NestedField::required(6, "string", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(7, "boolean", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::required(8, "binary", Type::Primitive(PrimitiveType::Binary)).into(), + NestedField::required(9, "date", Type::Primitive(PrimitiveType::Date)).into(), + NestedField::required(10, "time", Type::Primitive(PrimitiveType::Time)).into(), + NestedField::required(11, "timestamp", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(12, "fixed", Type::Primitive(PrimitiveType::Fixed(10))).into(), + NestedField::required(13, "uuid", Type::Primitive(PrimitiveType::Uuid)).into(), + NestedField::required( + 14, + "timestamptz", + Type::Primitive(PrimitiveType::Timestamptz), + ) + .into(), + NestedField::required( + 15, + "struct", + Type::Struct(StructType::new(vec![ + NestedField::required(18, "int", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(19, "string", Type::Primitive(PrimitiveType::String)) + .into(), + ])), + ) + .into(), + NestedField::required( + 16, + "list", + Type::List(ListType::new( + NestedField::list_element(20, Type::Primitive(PrimitiveType::Int),
true).into(), + )), + ) + .into(), + NestedField::required( + 17, + "map", + Type::Map(MapType::new( + NestedField::map_key_element(21, Type::Primitive(PrimitiveType::Int)).into(), + NestedField::map_value_element( + 22, + Type::Primitive(PrimitiveType::String), + true, + ) + .into(), + )), + ) + .into(), + ]) + .build() + .unwrap(); + + let table_creation = TableCreation::builder() + .name("t1".to_string()) + .schema(schema.clone()) + .build(); + + let table = fixture + .rest_catalog + .create_table(ns.name(), table_creation) + .await + .unwrap(); + + // Create the writer and write the data + let schema: Arc<arrow_schema::Schema> = Arc::new( + table + .metadata() + .current_schema() + .as_ref() + .try_into() + .unwrap(), + ); + let location_generator = DefaultLocationGenerator::new(table.metadata().clone()).unwrap(); + let file_name_generator = DefaultFileNameGenerator::new( + "test".to_string(), + None, + iceberg::spec::DataFileFormat::Parquet, + ); + let parquet_writer_builder = ParquetWriterBuilder::new( + WriterProperties::default(), + table.metadata().current_schema().clone(), + table.file_io().clone(), + location_generator.clone(), + file_name_generator.clone(), + ); + let data_file_writer_builder = DataFileWriterBuilder::new(parquet_writer_builder, None); + let mut data_file_writer = data_file_writer_builder.build().await.unwrap(); + + // Prepare data + let col1 = Int32Array::from(vec![1, 2, 3, 4, 5]); + let col2 = Int64Array::from(vec![1, 2, 3, 4, 5]); + let col3 = Float32Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5]); + let col4 = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5]); + let col5 = Decimal128Array::from(vec![ + Some(1.into()), + Some(2.into()), + Some(3.into()), + Some(4.into()), + Some(5.into()), + ]) + .with_data_type(DataType::Decimal128(20, 5)); + let col6 = StringArray::from(vec!["a", "b", "c", "d", "e"]); + let col7 = BooleanArray::from(vec![true, false, true, false, true]); + let col8 = LargeBinaryArray::from_opt_vec(vec![ + Some(b"a"), + Some(b"b"), + Some(b"c"), + Some(b"d"), + Some(b"e"), + ]); + let col9 = Date32Array::from(vec![1, 2, 3, 4, 5]); + let col10 = Time64MicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let col11 = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let col12 = FixedSizeBinaryArray::try_from_iter( + vec![ + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ] + .into_iter(), + ) + .unwrap(); + assert_eq!(col12.data_type(), &DataType::FixedSizeBinary(10)); + let col13 = FixedSizeBinaryArray::try_from_iter( + vec![ + Uuid::new_v4().as_bytes().to_vec(), + Uuid::new_v4().as_bytes().to_vec(), + Uuid::new_v4().as_bytes().to_vec(), + Uuid::new_v4().as_bytes().to_vec(), + Uuid::new_v4().as_bytes().to_vec(), + ] + .into_iter(), + ) + .unwrap(); + assert_eq!(col13.data_type(), &DataType::FixedSizeBinary(16)); + let col14 = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]).with_timezone(UTC_TIME_ZONE); + let col15 = StructArray::from(vec![ + ( + Arc::new( + Field::new("int", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + 18.to_string(), + )])), + ), + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as ArrayRef, + ), + ( + Arc::new( + Field::new("string", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + 19.to_string(), + )])), + ), + Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])) as ArrayRef, + ), + ]); + let
col16 = { + let mut builder = ListBuilder::new(Int32Builder::new()).with_field(Arc::new( + Field::new(LIST_FIELD_NAME, DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + 20.to_string(), + )])), + )); + builder.append_value([Some(1), Some(2), Some(3), Some(4), Some(5)]); + builder.append_value([Some(1), Some(2), Some(3), Some(4), Some(5)]); + builder.append_value([Some(1), Some(2), Some(3), Some(4), Some(5)]); + builder.append_value([Some(1), Some(2), Some(3), Some(4), Some(5)]); + builder.append_value([Some(1), Some(2), Some(3), Some(4), Some(5)]); + builder.finish() + }; + let col17 = { + let string_builder = StringBuilder::new(); + let int_builder = Int32Builder::with_capacity(4); + let mut builder = MapBuilder::new(None, int_builder, string_builder); + builder.keys().append_value(1); + builder.values().append_value("a"); + builder.append(true).unwrap(); + builder.keys().append_value(2); + builder.values().append_value("b"); + builder.append(true).unwrap(); + builder.keys().append_value(3); + builder.values().append_value("c"); + builder.append(true).unwrap(); + builder.keys().append_value(4); + builder.values().append_value("d"); + builder.append(true).unwrap(); + builder.keys().append_value(5); + builder.values().append_value("e"); + builder.append(true).unwrap(); + let array = builder.finish(); + let (_field, offsets, entries, nulls, ordered) = array.into_parts(); + let new_struct_fields = Fields::from(vec![ + Field::new(MAP_KEY_FIELD_NAME, DataType::Int32, false).with_metadata(HashMap::from([ + (PARQUET_FIELD_ID_META_KEY.to_string(), 21.to_string()), + ])), + Field::new(MAP_VALUE_FIELD_NAME, DataType::Utf8, false).with_metadata(HashMap::from([ + (PARQUET_FIELD_ID_META_KEY.to_string(), 22.to_string()), + ])), + ]); + let entries = { + let (_, arrays, nulls) = entries.into_parts(); + StructArray::new(new_struct_fields.clone(), arrays, nulls) + }; + let field = Arc::new(Field::new( + DEFAULT_MAP_FIELD_NAME, + DataType::Struct(new_struct_fields), + false, + )); + MapArray::new(field, offsets, entries, nulls, ordered) + }; + + let batch = RecordBatch::try_new(schema.clone(), vec![ + Arc::new(col1) as ArrayRef, + Arc::new(col2) as ArrayRef, + Arc::new(col3) as ArrayRef, + Arc::new(col4) as ArrayRef, + Arc::new(col5) as ArrayRef, + Arc::new(col6) as ArrayRef, + Arc::new(col7) as ArrayRef, + Arc::new(col8) as ArrayRef, + Arc::new(col9) as ArrayRef, + Arc::new(col10) as ArrayRef, + Arc::new(col11) as ArrayRef, + Arc::new(col12) as ArrayRef, + Arc::new(col13) as ArrayRef, + Arc::new(col14) as ArrayRef, + Arc::new(col15) as ArrayRef, + Arc::new(col16) as ArrayRef, + Arc::new(col17) as ArrayRef, + ]) + .unwrap(); + data_file_writer.write(batch.clone()).await.unwrap(); + let data_file = data_file_writer.close().await.unwrap(); + + // commit result + let tx = Transaction::new(&table); + let mut append_action = tx.fast_append(None, vec![]).unwrap(); + append_action.add_data_files(data_file.clone()).unwrap(); + let tx = append_action.apply().await.unwrap(); + let table = tx.commit(&fixture.rest_catalog).await.unwrap(); + + // check result + let batch_stream = table + .scan() + .select(vec![ + "int", + "long", + "float", + "double", + "decimal", + "string", + "boolean", + "binary", + "date", + "time", + "timestamp", + "fixed", + "uuid", + "timestamptz", + "struct", + "list", + "map", + ]) + .build() + .unwrap() + .to_arrow() + .await + .unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + 
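// The batch read back must equal the one written: a lossless round-trip for every type in the schema. +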
assert_eq!(batches[0], batch); + + // check result with select_all + let batch_stream = table + .scan() + .select_all() + .build() + .unwrap() + .to_arrow() + .await + .unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0], batch); +} diff --git a/crates/integrations/datafusion/Cargo.toml b/crates/integrations/datafusion/Cargo.toml index 81a94d839..ccb9ca175 100644 --- a/crates/integrations/datafusion/Cargo.toml +++ b/crates/integrations/datafusion/Cargo.toml @@ -20,7 +20,10 @@ name = "iceberg-datafusion" version = { workspace = true } edition = { workspace = true } homepage = { workspace = true } -rust-version = { workspace = true } +# kept the same as DataFusion's MSRV +# https://github.com/apache/datafusion?tab=readme-ov-file#rust-version-compatibility-policy +# https://github.com/apache/datafusion/blob/main/Cargo.toml#L68 +rust-version = "1.80.1" categories = ["database"] description = "Apache Iceberg DataFusion Integration" @@ -31,7 +34,7 @@ keywords = ["iceberg", "integrations", "datafusion"] [dependencies] anyhow = { workspace = true } async-trait = { workspace = true } -datafusion = { version = "44" } +datafusion = { workspace = true } futures = { workspace = true } iceberg = { workspace = true } tokio = { workspace = true } diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index f438308e6..03fb132f2 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -119,6 +119,7 @@ _ => TransformedResult::NotTransformed, } } + Expr::Cast(c) => to_iceberg_predicate(&c.expr), _ => TransformedResult::NotTransformed, } } @@ -211,7 +212,7 @@ fn scalar_value_to_datum(value: &ScalarValue) -> Option<Datum> { #[cfg(test)] mod tests { - use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::common::DFSchema; use datafusion::logical_expr::utils::split_conjunction; use datafusion::prelude::{Expr, SessionContext}; @@ -224,6 +225,7 @@ mod tests { let arrow_schema = Schema::new(vec![ Field::new("foo", DataType::Int32, true), Field::new("bar", DataType::Utf8, true), + Field::new("ts", DataType::Timestamp(TimeUnit::Second, None), true), ]); DFSchema::try_from_qualified_schema("my_table", &arrow_schema).unwrap() } @@ -392,4 +394,13 @@ mod tests { let expected_predicate = Reference::new("foo").less_than(Datum::long(0)); assert_eq!(predicate, expected_predicate); } + + #[test] + fn test_predicate_conversion_with_cast() { + let sql = "ts >= timestamp '2023-01-05T00:00:00'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = + Reference::new("ts").greater_than_or_equal_to(Datum::string("2023-01-05T00:00:00")); + assert_eq!(predicate, expected_predicate); + } } diff --git a/crates/test_utils/Cargo.toml b/crates/test_utils/Cargo.toml index d4f6e1696..c98309760 100644 --- a/crates/test_utils/Cargo.toml +++ b/crates/test_utils/Cargo.toml @@ -27,7 +27,7 @@ license = { workspace = true } [dependencies] env_logger = { workspace = true } -log = "0.4.20" +log = { workspace = true } [features] tests = []
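A closing note on the `Expr::Cast` arm added to `to_iceberg_predicate` above: DataFusion's planner can wrap a column in a cast when it is compared against a literal of a different type (as in the new `test_predicate_conversion_with_cast` case), and such filters previously fell through to `NotTransformed`, so they never reached the Iceberg scan as predicates. A minimal usage sketch of what the arm enables; `ctx`, the column choice, and the filter value are illustrative, and it assumes a `SessionContext` with the Iceberg-backed `types_table` registered as in `tests/datafusion.rs`, running inside an async function that returns `Result<(), DataFusionError>`:

    // The planner may cast `ctimestamp_ntz` to compare it with the literal.
    // The Expr::Cast arm unwraps that cast and converts the inner comparison,
    // so the filter can be pushed into the Iceberg scan rather than being
    // applied only after rows are read.
    let batches = ctx
        .sql("SELECT cbigint FROM types_table WHERE ctimestamp_ntz >= timestamp '1970-01-01T00:00:03'")
        .await?
        .collect()
        .await?;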