From edbc16985d735ee276e83c776f5e5989735948ca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 07:39:20 +0100 Subject: [PATCH 01/32] Bump griffe from 1.5.1 to 1.5.4 (#1474) Bumps [griffe](https://github.com/mkdocstrings/griffe) from 1.5.1 to 1.5.4. - [Release notes](https://github.com/mkdocstrings/griffe/releases) - [Changelog](https://github.com/mkdocstrings/griffe/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/griffe/compare/1.5.1...1.5.4) --- updated-dependencies: - dependency-name: griffe dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index bf992c03a3..45da03aa05 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -16,7 +16,7 @@ # under the License. mkdocs==1.6.1 -griffe==1.5.1 +griffe==1.5.4 jinja2==3.1.5 mkdocstrings==0.27.0 mkdocstrings-python==1.12.2 From f5bdae84f49a07056ba97db973d668a81f78f795 Mon Sep 17 00:00:00 2001 From: Tyler White <50381805+IndexSeek@users.noreply.github.com> Date: Fri, 27 Dec 2024 01:40:39 -0500 Subject: [PATCH 02/32] docs: various spelling fixes (#1471) --- mkdocs/docs/api.md | 2 +- mkdocs/docs/how-to-release.md | 2 +- mkdocs/docs/verify-release.md | 2 +- pyiceberg/table/__init__.py | 2 +- pyiceberg/utils/decimal.py | 2 +- tests/integration/test_writes/test_partitioned_writes.py | 6 +++--- tests/table/test_init.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 7aa4159016..9c48718877 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1005,7 +1005,7 @@ tbl.add_files(file_paths=file_paths) ## Schema evolution -PyIceberg supports full schema evolution through the Python API. It takes care of setting the field-IDs and makes sure that only non-breaking changes are done (can be overriden). +PyIceberg supports full schema evolution through the Python API. It takes care of setting the field-IDs and makes sure that only non-breaking changes are done (can be overridden). In the examples below, the `.update_schema()` is called from the table itself. diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md index bea5548748..c44f56a9ff 100644 --- a/mkdocs/docs/how-to-release.md +++ b/mkdocs/docs/how-to-release.md @@ -31,7 +31,7 @@ This guide outlines the process for releasing PyIceberg in accordance with the [ * A GPG key must be registered and published in the [Apache Iceberg KEYS file](https://downloads.apache.org/iceberg/KEYS). Follow [the instructions for setting up a GPG key and uploading it to the KEYS file](#set-up-gpg-key-and-upload-to-apache-iceberg-keys-file). * SVN Access - * Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Commmitter access). + * Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Committer access). * Permission to upload artifacts to the [Apache release distribution](https://dist.apache.org/repos/dist/release/iceberg/) (requires Apache PMC access). * PyPI Access * The `twine` package must be installed for uploading releases to PyPi. 
diff --git a/mkdocs/docs/verify-release.md b/mkdocs/docs/verify-release.md index 07e4c32a86..6148bfebdb 100644 --- a/mkdocs/docs/verify-release.md +++ b/mkdocs/docs/verify-release.md @@ -111,7 +111,7 @@ To run the full test coverage, with both unit tests and integration tests: make test-coverage ``` -This will spin up Docker containers to faciliate running test coverage. +This will spin up Docker containers to facilitate running test coverage. # Cast the vote diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 4ec3403bb3..2469a9ed7b 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -902,7 +902,7 @@ def scan( Args: row_filter: - A string or BooleanExpression that decsribes the + A string or BooleanExpression that describes the desired rows selected_fields: A tuple of strings representing the column names diff --git a/pyiceberg/utils/decimal.py b/pyiceberg/utils/decimal.py index 4432564dd1..99638d2a00 100644 --- a/pyiceberg/utils/decimal.py +++ b/pyiceberg/utils/decimal.py @@ -85,7 +85,7 @@ def bytes_to_decimal(value: bytes, scale: int) -> Decimal: """Return a decimal from the bytes. Args: - value (bytes): tbe bytes to be converted into a decimal. + value (bytes): the bytes to be converted into a decimal. scale (int): the scale of the decimal. Returns: diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index b92c338931..8a3a5c9acc 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -395,7 +395,7 @@ def test_dynamic_partition_overwrite_unpartitioned_evolve_to_identity_transform( # For a long string, the lower bound and upper bound is truncated # e.g. aaaaaaaaaaaaaaaaaaaaaa has lower bound of aaaaaaaaaaaaaaaa and upper bound of aaaaaaaaaaaaaaab # this makes strict metric evaluator determine the file evaluate as ROWS_MIGHT_NOT_MATCH - # this further causes the partitioned data file to be overwriten rather than deleted + # this further causes the partitioned data file to be overwritten rather than deleted if part_col == "string_long": expected_operations = ["append", "append", "overwrite", "append"] assert tbl.inspect.snapshots().to_pydict()["operation"] == expected_operations @@ -539,7 +539,7 @@ def test_data_files_with_table_partitioned_with_null( # the first snapshot generates M3 with 6 delete data entries collected from M1 and M2. # ML3 = [M3] # - # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delte entries it does not lint to it. + # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delete entries it does not lint to it. # ML4 = [M4] # Append : Append generates M5 with new data entries and links to all previous manifests which is M4 . @@ -552,7 +552,7 @@ def test_data_files_with_table_partitioned_with_null( # ML6 = [M6, M7, M8] # # The second snapshot generates M9 with 3 appended data entries and it also looks at manifests in ML6 (previous manifests) - # it ignores M6 since it only has delte entries but it links to M7 and M8. + # it ignores M6 since it only has delete entries but it links to M7 and M8. 
# ML7 = [M9, M7, M8] # tldr: diff --git a/tests/table/test_init.py b/tests/table/test_init.py index bdc3d030fd..397fa9f537 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -527,7 +527,7 @@ def test_update_column(table_v1: Table, table_v2: Table) -> None: new_schema = table.transaction().update_schema().update_column("y", doc=COMMENT2)._apply() assert new_schema.find_field("y").doc == COMMENT2, "failed to update existing field doc" - # update existing doc to an emtpy string + # update existing doc to an empty string assert new_schema.find_field("y").doc == COMMENT2 new_schema2 = table.transaction().update_schema().update_column("y", doc="")._apply() assert new_schema2.find_field("y").doc == "", "failed to remove existing field doc" From 6e537e86d4db52b151088f3f3fdb012ee1c3cc77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 08:47:10 +0100 Subject: [PATCH 03/32] Bump coverage from 7.6.9 to 7.6.10 (#1473) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.6.9 to 7.6.10. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.6.9...7.6.10) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 126 ++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6e4f55f39a..e6afffab09 100644 --- a/poetry.lock +++ b/poetry.lock @@ -701,73 +701,73 @@ files = [ [[package]] name = "coverage" -version = "7.6.9" +version = "7.6.10" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" files = [ - {file = "coverage-7.6.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85d9636f72e8991a1706b2b55b06c27545448baf9f6dbf51c4004609aacd7dcb"}, - {file = "coverage-7.6.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:608a7fd78c67bee8936378299a6cb9f5149bb80238c7a566fc3e6717a4e68710"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96d636c77af18b5cb664ddf12dab9b15a0cfe9c0bde715da38698c8cea748bfa"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75cded8a3cff93da9edc31446872d2997e327921d8eed86641efafd350e1df1"}, - {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7b15f589593110ae767ce997775d645b47e5cbbf54fd322f8ebea6277466cec"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:44349150f6811b44b25574839b39ae35291f6496eb795b7366fef3bd3cf112d3"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d891c136b5b310d0e702e186d70cd16d1119ea8927347045124cb286b29297e5"}, - {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:db1dab894cc139f67822a92910466531de5ea6034ddfd2b11c0d4c6257168073"}, - {file = "coverage-7.6.9-cp310-cp310-win32.whl", hash = "sha256:41ff7b0da5af71a51b53f501a3bac65fb0ec311ebed1632e58fc6107f03b9198"}, - {file = "coverage-7.6.9-cp310-cp310-win_amd64.whl", hash = 
"sha256:35371f8438028fdccfaf3570b31d98e8d9eda8bb1d6ab9473f5a390969e98717"}, - {file = "coverage-7.6.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:932fc826442132dde42ee52cf66d941f581c685a6313feebed358411238f60f9"}, - {file = "coverage-7.6.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:085161be5f3b30fd9b3e7b9a8c301f935c8313dcf928a07b116324abea2c1c2c"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccc660a77e1c2bf24ddbce969af9447a9474790160cfb23de6be4fa88e3951c7"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c69e42c892c018cd3c8d90da61d845f50a8243062b19d228189b0224150018a9"}, - {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0824a28ec542a0be22f60c6ac36d679e0e262e5353203bea81d44ee81fe9c6d4"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4401ae5fc52ad8d26d2a5d8a7428b0f0c72431683f8e63e42e70606374c311a1"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98caba4476a6c8d59ec1eb00c7dd862ba9beca34085642d46ed503cc2d440d4b"}, - {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ee5defd1733fd6ec08b168bd4f5387d5b322f45ca9e0e6c817ea6c4cd36313e3"}, - {file = "coverage-7.6.9-cp311-cp311-win32.whl", hash = "sha256:f2d1ec60d6d256bdf298cb86b78dd715980828f50c46701abc3b0a2b3f8a0dc0"}, - {file = "coverage-7.6.9-cp311-cp311-win_amd64.whl", hash = "sha256:0d59fd927b1f04de57a2ba0137166d31c1a6dd9e764ad4af552912d70428c92b"}, - {file = "coverage-7.6.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:99e266ae0b5d15f1ca8d278a668df6f51cc4b854513daab5cae695ed7b721cf8"}, - {file = "coverage-7.6.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9901d36492009a0a9b94b20e52ebfc8453bf49bb2b27bca2c9706f8b4f5a554a"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abd3e72dd5b97e3af4246cdada7738ef0e608168de952b837b8dd7e90341f015"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff74026a461eb0660366fb01c650c1d00f833a086b336bdad7ab00cc952072b3"}, - {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65dad5a248823a4996724a88eb51d4b31587aa7aa428562dbe459c684e5787ae"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:22be16571504c9ccea919fcedb459d5ab20d41172056206eb2994e2ff06118a4"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f957943bc718b87144ecaee70762bc2bc3f1a7a53c7b861103546d3a403f0a6"}, - {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ae1387db4aecb1f485fb70a6c0148c6cdaebb6038f1d40089b1fc84a5db556f"}, - {file = "coverage-7.6.9-cp312-cp312-win32.whl", hash = "sha256:1a330812d9cc7ac2182586f6d41b4d0fadf9be9049f350e0efb275c8ee8eb692"}, - {file = "coverage-7.6.9-cp312-cp312-win_amd64.whl", hash = "sha256:b12c6b18269ca471eedd41c1b6a1065b2f7827508edb9a7ed5555e9a56dcfc97"}, - {file = "coverage-7.6.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:899b8cd4781c400454f2f64f7776a5d87bbd7b3e7f7bda0cb18f857bb1334664"}, - {file = "coverage-7.6.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:61f70dc68bd36810972e55bbbe83674ea073dd1dcc121040a08cdf3416c5349c"}, - {file = 
"coverage-7.6.9-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a289d23d4c46f1a82d5db4abeb40b9b5be91731ee19a379d15790e53031c014"}, - {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e216d8044a356fc0337c7a2a0536d6de07888d7bcda76febcb8adc50bdbbd00"}, - {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c026eb44f744acaa2bda7493dad903aa5bf5fc4f2554293a798d5606710055d"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e77363e8425325384f9d49272c54045bbed2f478e9dd698dbc65dbc37860eb0a"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:777abfab476cf83b5177b84d7486497e034eb9eaea0d746ce0c1268c71652077"}, - {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:447af20e25fdbe16f26e84eb714ba21d98868705cb138252d28bc400381f6ffb"}, - {file = "coverage-7.6.9-cp313-cp313-win32.whl", hash = "sha256:d872ec5aeb086cbea771c573600d47944eea2dcba8be5f3ee649bfe3cb8dc9ba"}, - {file = "coverage-7.6.9-cp313-cp313-win_amd64.whl", hash = "sha256:fd1213c86e48dfdc5a0cc676551db467495a95a662d2396ecd58e719191446e1"}, - {file = "coverage-7.6.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ba9e7484d286cd5a43744e5f47b0b3fb457865baf07bafc6bee91896364e1419"}, - {file = "coverage-7.6.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e5ea1cf0872ee455c03e5674b5bca5e3e68e159379c1af0903e89f5eba9ccc3a"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d10e07aa2b91835d6abec555ec8b2733347956991901eea6ffac295f83a30e4"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:13a9e2d3ee855db3dd6ea1ba5203316a1b1fd8eaeffc37c5b54987e61e4194ae"}, - {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c38bf15a40ccf5619fa2fe8f26106c7e8e080d7760aeccb3722664c8656b030"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d5275455b3e4627c8e7154feaf7ee0743c2e7af82f6e3b561967b1cca755a0be"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8f8770dfc6e2c6a2d4569f411015c8d751c980d17a14b0530da2d7f27ffdd88e"}, - {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8d2dfa71665a29b153a9681edb1c8d9c1ea50dfc2375fb4dac99ea7e21a0bcd9"}, - {file = "coverage-7.6.9-cp313-cp313t-win32.whl", hash = "sha256:5e6b86b5847a016d0fbd31ffe1001b63355ed309651851295315031ea7eb5a9b"}, - {file = "coverage-7.6.9-cp313-cp313t-win_amd64.whl", hash = "sha256:97ddc94d46088304772d21b060041c97fc16bdda13c6c7f9d8fcd8d5ae0d8611"}, - {file = "coverage-7.6.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:adb697c0bd35100dc690de83154627fbab1f4f3c0386df266dded865fc50a902"}, - {file = "coverage-7.6.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:be57b6d56e49c2739cdf776839a92330e933dd5e5d929966fbbd380c77f060be"}, - {file = "coverage-7.6.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1592791f8204ae9166de22ba7e6705fa4ebd02936c09436a1bb85aabca3e599"}, - {file = "coverage-7.6.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e12ae8cc979cf83d258acb5e1f1cf2f3f83524d1564a49d20b8bec14b637f08"}, - {file = 
"coverage-7.6.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb5555cff66c4d3d6213a296b360f9e1a8e323e74e0426b6c10ed7f4d021e464"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b9389a429e0e5142e69d5bf4a435dd688c14478a19bb901735cdf75e57b13845"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:592ac539812e9b46046620341498caf09ca21023c41c893e1eb9dbda00a70cbf"}, - {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a27801adef24cc30871da98a105f77995e13a25a505a0161911f6aafbd66e678"}, - {file = "coverage-7.6.9-cp39-cp39-win32.whl", hash = "sha256:8e3c3e38930cfb729cb8137d7f055e5a473ddaf1217966aa6238c88bd9fd50e6"}, - {file = "coverage-7.6.9-cp39-cp39-win_amd64.whl", hash = "sha256:e28bf44afa2b187cc9f41749138a64435bf340adfcacb5b2290c070ce99839d4"}, - {file = "coverage-7.6.9-pp39.pp310-none-any.whl", hash = "sha256:f3ca78518bc6bc92828cd11867b121891d75cae4ea9e908d72030609b996db1b"}, - {file = "coverage-7.6.9.tar.gz", hash = "sha256:4a8d8977b0c6ef5aeadcb644da9e69ae0dcfe66ec7f368c89c72e058bd71164d"}, + {file = "coverage-7.6.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c912978f7fbf47ef99cec50c4401340436d200d41d714c7a4766f377c5b7b78"}, + {file = "coverage-7.6.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a01ec4af7dfeb96ff0078ad9a48810bb0cc8abcb0115180c6013a6b26237626c"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3b204c11e2b2d883946fe1d97f89403aa1811df28ce0447439178cc7463448a"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32ee6d8491fcfc82652a37109f69dee9a830e9379166cb73c16d8dc5c2915165"}, + {file = "coverage-7.6.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675cefc4c06e3b4c876b85bfb7c59c5e2218167bbd4da5075cbe3b5790a28988"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f4f620668dbc6f5e909a0946a877310fb3d57aea8198bde792aae369ee1c23b5"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4eea95ef275de7abaef630c9b2c002ffbc01918b726a39f5a4353916ec72d2f3"}, + {file = "coverage-7.6.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e2f0280519e42b0a17550072861e0bc8a80a0870de260f9796157d3fca2733c5"}, + {file = "coverage-7.6.10-cp310-cp310-win32.whl", hash = "sha256:bc67deb76bc3717f22e765ab3e07ee9c7a5e26b9019ca19a3b063d9f4b874244"}, + {file = "coverage-7.6.10-cp310-cp310-win_amd64.whl", hash = "sha256:0f460286cb94036455e703c66988851d970fdfd8acc2a1122ab7f4f904e4029e"}, + {file = "coverage-7.6.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ea3c8f04b3e4af80e17bab607c386a830ffc2fb88a5484e1df756478cf70d1d3"}, + {file = "coverage-7.6.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:507a20fc863cae1d5720797761b42d2d87a04b3e5aeb682ef3b7332e90598f43"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d37a84878285b903c0fe21ac8794c6dab58150e9359f1aaebbeddd6412d53132"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a534738b47b0de1995f85f582d983d94031dffb48ab86c95bdf88dc62212142f"}, + {file = "coverage-7.6.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0d7a2bf79378d8fb8afaa994f91bfd8215134f8631d27eba3e0e2c13546ce994"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6713ba4b4ebc330f3def51df1d5d38fad60b66720948112f114968feb52d3f99"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ab32947f481f7e8c763fa2c92fd9f44eeb143e7610c4ca9ecd6a36adab4081bd"}, + {file = "coverage-7.6.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7bbd8c8f1b115b892e34ba66a097b915d3871db7ce0e6b9901f462ff3a975377"}, + {file = "coverage-7.6.10-cp311-cp311-win32.whl", hash = "sha256:299e91b274c5c9cdb64cbdf1b3e4a8fe538a7a86acdd08fae52301b28ba297f8"}, + {file = "coverage-7.6.10-cp311-cp311-win_amd64.whl", hash = "sha256:489a01f94aa581dbd961f306e37d75d4ba16104bbfa2b0edb21d29b73be83609"}, + {file = "coverage-7.6.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:27c6e64726b307782fa5cbe531e7647aee385a29b2107cd87ba7c0105a5d3853"}, + {file = "coverage-7.6.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c56e097019e72c373bae32d946ecf9858fda841e48d82df7e81c63ac25554078"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7827a5bc7bdb197b9e066cdf650b2887597ad124dd99777332776f7b7c7d0d0"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:204a8238afe787323a8b47d8be4df89772d5c1e4651b9ffa808552bdf20e1d50"}, + {file = "coverage-7.6.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e67926f51821b8e9deb6426ff3164870976fe414d033ad90ea75e7ed0c2e5022"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e78b270eadb5702938c3dbe9367f878249b5ef9a2fcc5360ac7bff694310d17b"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:714f942b9c15c3a7a5fe6876ce30af831c2ad4ce902410b7466b662358c852c0"}, + {file = "coverage-7.6.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:abb02e2f5a3187b2ac4cd46b8ced85a0858230b577ccb2c62c81482ca7d18852"}, + {file = "coverage-7.6.10-cp312-cp312-win32.whl", hash = "sha256:55b201b97286cf61f5e76063f9e2a1d8d2972fc2fcfd2c1272530172fd28c359"}, + {file = "coverage-7.6.10-cp312-cp312-win_amd64.whl", hash = "sha256:e4ae5ac5e0d1e4edfc9b4b57b4cbecd5bc266a6915c500f358817a8496739247"}, + {file = "coverage-7.6.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05fca8ba6a87aabdd2d30d0b6c838b50510b56cdcfc604d40760dae7153b73d9"}, + {file = "coverage-7.6.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9e80eba8801c386f72e0712a0453431259c45c3249f0009aff537a517b52942b"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a372c89c939d57abe09e08c0578c1d212e7a678135d53aa16eec4430adc5e690"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec22b5e7fe7a0fa8509181c4aac1db48f3dd4d3a566131b313d1efc102892c18"}, + {file = "coverage-7.6.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26bcf5c4df41cad1b19c84af71c22cbc9ea9a547fc973f1f2cc9a290002c8b3c"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e4630c26b6084c9b3cb53b15bd488f30ceb50b73c35c5ad7871b869cb7365fd"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:2396e8116db77789f819d2bc8a7e200232b7a282c66e0ae2d2cd84581a89757e"}, + {file = "coverage-7.6.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:79109c70cc0882e4d2d002fe69a24aa504dec0cc17169b3c7f41a1d341a73694"}, + {file = "coverage-7.6.10-cp313-cp313-win32.whl", hash = "sha256:9e1747bab246d6ff2c4f28b4d186b205adced9f7bd9dc362051cc37c4a0c7bd6"}, + {file = "coverage-7.6.10-cp313-cp313-win_amd64.whl", hash = "sha256:254f1a3b1eef5f7ed23ef265eaa89c65c8c5b6b257327c149db1ca9d4a35f25e"}, + {file = "coverage-7.6.10-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2ccf240eb719789cedbb9fd1338055de2761088202a9a0b73032857e53f612fe"}, + {file = "coverage-7.6.10-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:0c807ca74d5a5e64427c8805de15b9ca140bba13572d6d74e262f46f50b13273"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bcfa46d7709b5a7ffe089075799b902020b62e7ee56ebaed2f4bdac04c508d8"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e0de1e902669dccbf80b0415fb6b43d27edca2fbd48c74da378923b05316098"}, + {file = "coverage-7.6.10-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7b444c42bbc533aaae6b5a2166fd1a797cdb5eb58ee51a92bee1eb94a1e1cb"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b330368cb99ef72fcd2dc3ed260adf67b31499584dc8a20225e85bfe6f6cfed0"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:9a7cfb50515f87f7ed30bc882f68812fd98bc2852957df69f3003d22a2aa0abf"}, + {file = "coverage-7.6.10-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f93531882a5f68c28090f901b1d135de61b56331bba82028489bc51bdd818d2"}, + {file = "coverage-7.6.10-cp313-cp313t-win32.whl", hash = "sha256:89d76815a26197c858f53c7f6a656686ec392b25991f9e409bcef020cd532312"}, + {file = "coverage-7.6.10-cp313-cp313t-win_amd64.whl", hash = "sha256:54a5f0f43950a36312155dae55c505a76cd7f2b12d26abeebbe7a0b36dbc868d"}, + {file = "coverage-7.6.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:656c82b8a0ead8bba147de9a89bda95064874c91a3ed43a00e687f23cc19d53a"}, + {file = "coverage-7.6.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ccc2b70a7ed475c68ceb548bf69cec1e27305c1c2606a5eb7c3afff56a1b3b27"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5e37dc41d57ceba70956fa2fc5b63c26dba863c946ace9705f8eca99daecdc4"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0aa9692b4fdd83a4647eeb7db46410ea1322b5ed94cd1715ef09d1d5922ba87f"}, + {file = "coverage-7.6.10-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa744da1820678b475e4ba3dfd994c321c5b13381d1041fe9c608620e6676e25"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c0b1818063dc9e9d838c09e3a473c1422f517889436dd980f5d721899e66f315"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:59af35558ba08b758aec4d56182b222976330ef8d2feacbb93964f576a7e7a90"}, + {file = "coverage-7.6.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7ed2f37cfce1ce101e6dffdfd1c99e729dd2ffc291d02d3e2d0af8b53d13840d"}, + {file = "coverage-7.6.10-cp39-cp39-win32.whl", hash = "sha256:4bcc276261505d82f0ad426870c3b12cb177752834a633e737ec5ee79bbdff18"}, + {file = 
"coverage-7.6.10-cp39-cp39-win_amd64.whl", hash = "sha256:457574f4599d2b00f7f637a0700a6422243b3565509457b2dbd3f50703e11f59"}, + {file = "coverage-7.6.10-pp39.pp310-none-any.whl", hash = "sha256:fd34e7b3405f0cc7ab03d54a334c17a9e802897580d964bd8c2001f4b9fd488f"}, + {file = "coverage-7.6.10.tar.gz", hash = "sha256:7fb105327c8f8f0682e29843e2ff96af9dcbe5bab8eeb4b398c6a33a16d80a23"}, ] [package.dependencies] From a5be07a2c0544876abb02e767dd4cabc3d69128d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:42:55 +0100 Subject: [PATCH 04/32] Bump mkdocstrings-python from 1.12.2 to 1.13.0 (#1472) Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.12.2 to 1.13.0. - [Release notes](https://github.com/mkdocstrings/python/releases) - [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/python/compare/1.12.2...1.13.0) --- updated-dependencies: - dependency-name: mkdocstrings-python dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 45da03aa05..f374b85bea 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -19,7 +19,7 @@ mkdocs==1.6.1 griffe==1.5.4 jinja2==3.1.5 mkdocstrings==0.27.0 -mkdocstrings-python==1.12.2 +mkdocstrings-python==1.13.0 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.2.0 mkdocs-gen-files==0.5.0 From a926d379e6d14ec5898aedc16aa5ac3e57e9ed2f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Dec 2024 17:23:07 +0100 Subject: [PATCH 05/32] Bump boto3 from 1.35.81 to 1.35.88 (#1476) Bumps [boto3](https://github.com/boto/boto3) from 1.35.81 to 1.35.88. - [Release notes](https://github.com/boto/boto3/releases) - [Commits](https://github.com/boto/boto3/compare/1.35.81...1.35.88) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index e6afffab09..893f5a4a9e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,24 +25,24 @@ tests = ["arrow", "dask[dataframe]", "docker", "pytest", "pytest-mock"] [[package]] name = "aiobotocore" -version = "2.16.0" +version = "2.16.1" description = "Async client for aws services using botocore and aiohttp" optional = true python-versions = ">=3.8" files = [ - {file = "aiobotocore-2.16.0-py3-none-any.whl", hash = "sha256:eb3641a7b9c51113adbc33a029441de6201ebb026c64ff2e149c7fa802c9abfc"}, - {file = "aiobotocore-2.16.0.tar.gz", hash = "sha256:6d6721961a81570e9b920b98778d95eec3d52a9f83b7844c6c5cfdbf2a2d6a11"}, + {file = "aiobotocore-2.16.1-py3-none-any.whl", hash = "sha256:e7cf6295471224c82a111deaf31c2c3a4bcd6dbd6973e75c7fc4739fcccd5b0b"}, + {file = "aiobotocore-2.16.1.tar.gz", hash = "sha256:0f94904c6a1d14d5aac0502fcc1d721b95ee60d46d8a0e546f6203de0410d522"}, ] [package.dependencies] aiohttp = ">=3.9.2,<4.0.0" aioitertools = ">=0.5.1,<1.0.0" -botocore = ">=1.35.74,<1.35.82" +botocore = ">=1.35.74,<1.35.89" wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.36.15,<1.36.23)"] -boto3 = ["boto3 (>=1.35.74,<1.35.82)"] +awscli = ["awscli (>=1.36.15,<1.36.30)"] +boto3 = ["boto3 (>=1.35.74,<1.35.89)"] [[package]] name = "aiohappyeyeballs" @@ -358,17 +358,17 @@ files = [ [[package]] name = "boto3" -version = "1.35.81" +version = "1.35.88" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.81-py3-none-any.whl", hash = "sha256:742941b2424c0223d2d94a08c3485462fa7c58d816b62ca80f08e555243acee1"}, - {file = "boto3-1.35.81.tar.gz", hash = "sha256:d2e95fa06f095b8e0c545dd678c6269d253809b2997c30f5ce8a956c410b4e86"}, + {file = "boto3-1.35.88-py3-none-any.whl", hash = "sha256:7bc9b27ad87607256470c70a86c8b8c319ddd6ecae89cc191687cbf8ccb7b6a6"}, + {file = "boto3-1.35.88.tar.gz", hash = "sha256:43c6a7a70bb226770a82a601870136e3bb3bf2808f4576ab5b9d7d140dbf1323"}, ] [package.dependencies] -botocore = ">=1.35.81,<1.36.0" +botocore = ">=1.35.88,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -377,13 +377,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.81" +version = "1.35.88" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.81-py3-none-any.whl", hash = "sha256:a7b13bbd959bf2d6f38f681676aab408be01974c46802ab997617b51399239f7"}, - {file = "botocore-1.35.81.tar.gz", hash = "sha256:564c2478e50179e0b766e6a87e5e0cdd35e1bc37eb375c1cf15511f5dd13600d"}, + {file = "botocore-1.35.88-py3-none-any.whl", hash = "sha256:e60cc3fbe8d7a10f70e7e852d76be2b29f23ead418a5899d366ea32b1eacb5a5"}, + {file = "botocore-1.35.88.tar.gz", hash = "sha256:58dcd9a464c354b8c6c25261d8de830d175d9739eae568bf0c52e57116fb03c6"}, ] [package.dependencies] From e6465001bd8a47718ff79da4def5800962e6b895 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 06:37:14 +0100 Subject: [PATCH 06/32] Bump moto from 5.0.24 to 5.0.25 (#1475) Bumps [moto](https://github.com/getmoto/moto) from 5.0.24 to 5.0.25. 
- [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.24...5.0.25) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 53 +++++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/poetry.lock b/poetry.lock index 893f5a4a9e..640cab2733 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1874,23 +1874,6 @@ cryptography = "*" [package.extras] drafts = ["pycryptodome"] -[[package]] -name = "jsondiff" -version = "2.2.1" -description = "Diff JSON and JSON-like structures in Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "jsondiff-2.2.1-py3-none-any.whl", hash = "sha256:b1f0f7e2421881848b1d556d541ac01a91680cfcc14f51a9b62cdf4da0e56722"}, - {file = "jsondiff-2.2.1.tar.gz", hash = "sha256:658d162c8a86ba86de26303cd86a7b37e1b2c1ec98b569a60e2ca6180545f7fe"}, -] - -[package.dependencies] -pyyaml = "*" - -[package.extras] -dev = ["build", "hypothesis", "pytest", "setuptools-scm"] - [[package]] name = "jsonpatch" version = "1.33" @@ -2249,13 +2232,13 @@ type = ["mypy (==1.11.2)"] [[package]] name = "moto" -version = "5.0.24" -description = "" +version = "5.0.25" +description = "A library that allows you to easily mock out tests based on AWS infrastructure" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.24-py3-none-any.whl", hash = "sha256:4d826f1574849f18ddd2fcbf614d97f82c8fddfb9d95fac1078da01a39b57c10"}, - {file = "moto-5.0.24.tar.gz", hash = "sha256:dba6426bd770fbb9d892633fbd35253cbc181eeaa0eba97d6f058720a8fe9b42"}, + {file = "moto-5.0.25-py3-none-any.whl", hash = "sha256:ab790f9d7d08f30667a196af7cacead03e76c10be2d1148ea00a731d47918a1e"}, + {file = "moto-5.0.25.tar.gz", hash = "sha256:deea8b158cec5a65c9635ae1fff4579d735b11ac8a0e5226fbbeb742ce0ce6b2"}, ] [package.dependencies] @@ -2271,10 +2254,9 @@ flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} -jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} -py-partiql-parser = {version = "0.5.6", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.6.1", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} @@ -2285,25 +2267,24 @@ werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "jsonschema", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.6)", "pyparsing (>=3.0.7)", "setuptools"] +all = ["PyYAML (>=5.1)", 
"antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsonpath-ng", "jsonschema", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.6.1)", "pyparsing (>=3.0.7)", "setuptools"] apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] awslambda = ["docker (>=3.0.0)"] batch = ["docker (>=3.0.0)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.6)", "pyparsing (>=3.0.7)", "setuptools"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.6.1)", "pyparsing (>=3.0.7)", "setuptools"] cognitoidp = ["joserfc (>=0.9.0)"] -dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.6)"] -dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.6)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.6.1)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.6.1)"] events = ["jsonpath-ng"] glue = ["pyparsing (>=3.0.7)"] -iotdata = ["jsondiff (>=1.1.2)"] -proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.6)", "pyparsing (>=3.0.7)", "setuptools"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.6.1)", "pyparsing (>=3.0.7)", "setuptools"] quicksight = ["jsonschema"] -resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.6)", "pyparsing (>=3.0.7)"] -s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.6)"] -s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.6)"] -server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.6)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.6.1)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.6.1)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.6.1)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.6.1)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] stepfunctions = 
["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -3068,13 +3049,13 @@ files = [ [[package]] name = "py-partiql-parser" -version = "0.5.6" +version = "0.6.1" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "py_partiql_parser-0.5.6-py2.py3-none-any.whl", hash = "sha256:622d7b0444becd08c1f4e9e73b31690f4b1c309ab6e5ed45bf607fe71319309f"}, - {file = "py_partiql_parser-0.5.6.tar.gz", hash = "sha256:6339f6bf85573a35686529fc3f491302e71dd091711dfe8df3be89a93767f97b"}, + {file = "py_partiql_parser-0.6.1-py2.py3-none-any.whl", hash = "sha256:ff6a48067bff23c37e9044021bf1d949c83e195490c17e020715e927fe5b2456"}, + {file = "py_partiql_parser-0.6.1.tar.gz", hash = "sha256:8583ff2a0e15560ef3bc3df109a7714d17f87d81d33e8c38b7fed4e58a63215d"}, ] [package.extras] From 5da1f4d6b66cdc689e561d6291abbb757ffa561a Mon Sep 17 00:00:00 2001 From: smaheshwar-pltr Date: Thu, 2 Jan 2025 15:06:20 -0500 Subject: [PATCH 07/32] URL-encode partition field names in file locations (#1457) * URL-encode partition field names in file locations * Separate into variable * Add test * Revert to main * Failing test * Disable justication from test * Use `quote_plus` instead of `quote` to match Java behaviour * Temporarily update test to pass * Uncomment test * Add unit test * Fix typo in comment * Add `make_name_compatible` suggestion so test passes * Fix typo in schema field name --------- Co-authored-by: Sreesh Maheshwar --- pyiceberg/partitioning.py | 8 ++- tests/integration/test_partitioning_key.py | 67 +++++++++++++++++++++- tests/table/test_partitioning.py | 24 +++++++- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 5f9178ebf9..c9b6316f59 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -30,7 +30,7 @@ Tuple, TypeVar, ) -from urllib.parse import quote +from urllib.parse import quote_plus from pydantic import ( BeforeValidator, @@ -234,9 +234,11 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: partition_field = self.fields[pos] value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) - value_str = quote(value_str, safe="") + value_str = quote_plus(value_str, safe="") value_strs.append(value_str) - field_strs.append(partition_field.name) + + field_str = quote_plus(partition_field.name, safe="") + field_strs.append(field_str) path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) return path diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 29f664909c..1ac808c7d0 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -18,7 +18,7 @@ import uuid from datetime import date, datetime, timedelta, timezone from decimal import Decimal -from typing import Any, List +from typing import Any, Callable, List, Optional import pytest from pyspark.sql import SparkSession @@ -70,6 +70,7 @@ NestedField(field_id=12, name="fixed_field", field_type=FixedType(16), required=False), NestedField(field_id=13, name="decimal_field", field_type=DecimalType(5, 2), required=False), NestedField(field_id=14, name="uuid_field", field_type=UUIDType(), required=False), + NestedField(field_id=15, name="special#string+field", field_type=StringType(), required=False), ) @@ -77,7 +78,7 @@ @pytest.mark.parametrize( - "partition_fields, partition_values, 
expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", + "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name", [ # # Identity Transform ( @@ -98,6 +99,7 @@ VALUES (false, 'Boolean field set to false'); """, + None, ), ( [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")], @@ -117,6 +119,7 @@ VALUES ('sample_string', 'Another string value') """, + None, ), ( [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")], @@ -136,6 +139,7 @@ VALUES (42, 'Associated string value for int 42') """, + None, ), ( [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")], @@ -155,6 +159,7 @@ VALUES (1234567890123456789, 'Associated string value for long 1234567890123456789') """, + None, ), ( [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")], @@ -178,6 +183,7 @@ # VALUES # (3.14, 'Associated string value for float 3.14') # """ + None, ), ( [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")], @@ -201,6 +207,7 @@ # VALUES # (6.282, 'Associated string value for double 6.282') # """ + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -220,6 +227,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -239,6 +247,7 @@ VALUES (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, + None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -263,6 +272,7 @@ # VALUES # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') # """ + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")], @@ -287,6 +297,7 @@ # VALUES # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # """ + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")], @@ -306,6 +317,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01') """, + None, ), ( [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")], @@ -325,6 +337,7 @@ VALUES ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479') """, + None, ), ( [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")], @@ -344,6 +357,7 @@ VALUES (CAST('example' AS BINARY), 'Associated string value for binary `example`') """, + None, ), ( [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")], @@ -363,6 +377,7 @@ VALUES (123.45, 'Associated string value for decimal 123.45') """, + None, ), # # Year Month Day Hour Transform # Month Transform @@ -384,6 +399,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 
11:55:59.999999'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")], @@ -403,6 +419,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")], @@ -422,6 +439,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # Year Transform ( @@ -442,6 +460,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")], @@ -461,6 +480,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")], @@ -480,6 +500,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # # Day Transform ( @@ -500,6 +521,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")], @@ -519,6 +541,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), ( [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")], @@ -538,6 +561,7 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, + None, ), # Hour Transform ( @@ -558,6 +582,7 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01'); """, + None, ), ( [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")], @@ -577,6 +602,7 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, + None, ), # Truncate Transform ( @@ -597,6 +623,7 @@ VALUES (12345, 'Sample data for int'); """, + None, ), ( [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")], @@ -616,6 +643,7 @@ VALUES (4294967297, 'Sample data for long'); """, + None, ), ( [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")], @@ -635,6 +663,7 @@ VALUES ('abcdefg', 'Another sample for string'); """, + None, ), ( [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")], @@ -654,6 +683,7 @@ VALUES (678.90, 'Associated string value for decimal 678.90') """, + None, ), ( [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")], @@ -673,6 +703,7 @@ VALUES (binary('HELLOICEBERG'), 'Sample data for binary'); """, + None, ), # Bucket Transform ( @@ -693,6 +724,7 @@ VALUES (10, 'Integer with value 10'); """, + None, ), # Test multiple field combinations could generate the Partition record and hive partition path correctly ( @@ -721,6 +753,27 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data'); """, + None, + ), + # Test that special characters are URL-encoded + ( + [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")], + ["special string"], + Record(**{"special#string+field": "special string"}), # type: ignore + 
"special%23string%2Bfield=special+string", + f"""CREATE TABLE {identifier} ( + `special#string+field` string + ) + USING iceberg + PARTITIONED BY ( + identity(`special#string+field`) + ) + """, + f"""INSERT INTO {identifier} + VALUES + ('special string') + """, + lambda name: name.replace("#", "_x23").replace("+", "_x2B"), ), ], ) @@ -734,6 +787,7 @@ def test_partition_key( expected_hive_partition_path_slice: str, spark_create_table_sql_for_justification: str, spark_data_insert_sql_for_justification: str, + make_compatible_name: Optional[Callable[[str], str]], ) -> None: partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)] spec = PartitionSpec(*partition_fields) @@ -768,5 +822,12 @@ def test_partition_key( spark_path_for_justification = ( snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path ) - assert spark_partition_for_justification == expected_partition_record + # Special characters in partition value are sanitized when written to the data file's partition field + # Use `make_compatible_name` to match the sanitize behavior + sanitized_record = ( + Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()}) + if make_compatible_name + else expected_partition_record + ) + assert spark_partition_for_justification == sanitized_record assert expected_hive_partition_path_slice in spark_path_for_justification diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index d7425bc351..127d57a798 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -16,7 +16,8 @@ # under the License. from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.transforms import BucketTransform, TruncateTransform +from pyiceberg.transforms import BucketTransform, IdentityTransform, TruncateTransform +from pyiceberg.typedef import Record from pyiceberg.types import ( IntegerType, NestedField, @@ -118,6 +119,27 @@ def test_deserialize_partition_spec() -> None: ) +def test_partition_spec_to_path() -> None: + schema = Schema( + NestedField(field_id=1, name="str", field_type=StringType(), required=False), + NestedField(field_id=2, name="other_str", field_type=StringType(), required=False), + NestedField(field_id=3, name="int", field_type=IntegerType(), required=True), + ) + + spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="my#str%bucket"), + PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"), + PartitionField(source_id=3, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"), + spec_id=3, + ) + + record = Record(**{"my#str%bucket": "my+str", "other str+bucket": "( )", "my!int:bucket": 10}) # type: ignore + + # Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java + # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204 + assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" + + def test_partition_type(table_schema_simple: Schema) -> None: spec = PartitionSpec( PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"), 
From f7d8a2f50a7e8caafd04ff8ec3c08e113a5b71b3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 08:39:12 +0100 Subject: [PATCH 08/32] Bump pyparsing from 3.2.0 to 3.2.1 (#1481) Bumps [pyparsing](https://github.com/pyparsing/pyparsing) from 3.2.0 to 3.2.1. - [Release notes](https://github.com/pyparsing/pyparsing/releases) - [Changelog](https://github.com/pyparsing/pyparsing/blob/master/CHANGES) - [Commits](https://github.com/pyparsing/pyparsing/compare/3.2.0...3.2.1) --- updated-dependencies: - dependency-name: pyparsing dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 640cab2733..4fd524bb3f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3330,13 +3330,13 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pyparsing" -version = "3.2.0" +version = "3.2.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" files = [ - {file = "pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84"}, - {file = "pyparsing-3.2.0.tar.gz", hash = "sha256:cbf74e27246d595d9a74b186b810f6fbb86726dbf3b9532efb343f6d7294fe9c"}, + {file = "pyparsing-3.2.1-py3-none-any.whl", hash = "sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1"}, + {file = "pyparsing-3.2.1.tar.gz", hash = "sha256:61980854fd66de3a90028d679a954d5f2623e83144b5afe5ee86f43d762e5f0a"}, ] [package.extras] From f863c4e7cde850ec23111d45105351b314716e3a Mon Sep 17 00:00:00 2001 From: Tyler White <50381805+IndexSeek@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:43:07 -0500 Subject: [PATCH 09/32] Configure `codespell` in `pre-commit` (#1478) * feat: configure codespell in pre-commit * add apache license header * style: resolve pre-commit violations --- .codespellrc | 18 ++++++++++++++++++ .pre-commit-config.yaml | 4 ++++ pyiceberg/avro/reader.py | 2 +- pyiceberg/io/pyarrow.py | 2 +- pyiceberg/utils/singleton.py | 2 +- tests/test_transforms.py | 2 +- 6 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 .codespellrc diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000000..a38787e126 --- /dev/null +++ b/.codespellrc @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+[codespell] +ignore-words-list = BoundIn,fo,MoR,NotIn,notIn,oT diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0b9a31792..bdd1f362b5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -69,6 +69,10 @@ repos: # --line-length is set to a high value to deal with very long lines - --line-length - '99999' + - repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell ci: autofix_commit_msg: | [pre-commit.ci] auto fixes from pre-commit.com hooks diff --git a/pyiceberg/avro/reader.py b/pyiceberg/avro/reader.py index 988bd42ba4..a5578680d6 100644 --- a/pyiceberg/avro/reader.py +++ b/pyiceberg/avro/reader.py @@ -51,7 +51,7 @@ def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable[[], None]) -> None: """Skips over an array or map. - Both the array and map are encoded similar, and we can re-use + Both the array and map are encoded similar, and we can reuse the logic of skipping in an efficient way. From the Avro spec: diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index ef6937f1bb..e8c9f64d63 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1536,7 +1536,7 @@ def _to_requested_schema( include_field_ids: bool = False, use_large_types: bool = True, ) -> pa.RecordBatch: - # We could re-use some of these visitors + # We could reuse some of these visitors struct_array = visit_with_partner( requested_schema, batch, diff --git a/pyiceberg/utils/singleton.py b/pyiceberg/utils/singleton.py index 8a4bbf91ce..06ee62febe 100644 --- a/pyiceberg/utils/singleton.py +++ b/pyiceberg/utils/singleton.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -This is a singleton metaclass that can be used to cache and re-use existing objects. +This is a singleton metaclass that can be used to cache and reuse existing objects. In the Iceberg codebase we have a lot of objects that are stateless (for example Types such as StringType, BooleanType etc). FixedTypes have arguments (eg. Fixed[22]) that we also make part of the key when caching diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 7ebab87e3a..6d04a1e4ce 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -899,7 +899,7 @@ def test_projection_truncate_string_set_same_result(bound_reference_str: BoundRe def test_projection_truncate_string_set_in(bound_reference_str: BoundReference[str]) -> None: assert TruncateTransform(3).project( "name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("world")}) - ) == In(term="name", literals={literal("hel"), literal("wor")}) + ) == In(term="name", literals={literal("hel"), literal("wor")}) # codespell:ignore hel def test_projection_truncate_string_set_not_in(bound_reference_str: BoundReference[str]) -> None: From acd6f5a8a19db709e835e2686b87d4db3dca254f Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 3 Jan 2025 15:22:02 -0500 Subject: [PATCH 10/32] Remove deprecation warnings (#1416) * tests/expressions/test_parser.py::test_is_null Deprecated in 0.8.0, will be removed in 0.9.0. Parsing expressions with table name is deprecated. Only provide field names in the row_filter. * tests/catalog/test_rest.py: Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI configuration --- pyiceberg/utils/deprecated.py | 1 - tests/catalog/test_rest.py | 33 ++++++++++++++++++++++++++++++++ tests/expressions/test_parser.py | 1 - 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/pyiceberg/utils/deprecated.py b/pyiceberg/utils/deprecated.py index da2cb3b500..b196f47ec6 100644 --- a/pyiceberg/utils/deprecated.py +++ b/pyiceberg/utils/deprecated.py @@ -56,7 +56,6 @@ def deprecation_message(deprecated_in: str, removed_in: str, help_message: Optio def _deprecation_warning(message: str) -> None: with warnings.catch_warnings(): # temporarily override warning handling - warnings.simplefilter("always", DeprecationWarning) # turn off filter warnings.warn( message, category=DeprecationWarning, diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 091a67166b..2a4b3a7a1f 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -121,6 +121,9 @@ def test_no_uri_supplied() -> None: RestCatalog("production") +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_200(rest_mock: Mocker) -> None: rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -141,6 +144,9 @@ def test_token_200(rest_mock: Mocker) -> None: ) +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -157,6 +163,9 @@ def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: ) +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: mock_request = rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -179,6 +188,9 @@ def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: assert TEST_RESOURCE in mock_request.last_request.text +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: mock_request = rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -199,6 +211,9 @@ def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: assert TEST_RESOURCE not in mock_request.last_request.text +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_with_default_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -217,6 +232,9 @@ def test_token_with_default_scope(rest_mock: Mocker) -> None: assert "catalog" in mock_request.last_request.text +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_with_custom_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -236,6 +254,9 @@ def test_token_with_custom_scope(rest_mock: Mocker) -> None: assert TEST_SCOPE in mock_request.last_request.text +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_200_w_auth_url(rest_mock: Mocker) -> None: rest_mock.post( TEST_AUTH_URL, @@ -258,6 +279,9 @@ def test_token_200_w_auth_url(rest_mock: Mocker) -> None: # pylint: enable=W0212 +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_config_200(requests_mock: Mocker) -> None: requests_mock.get( f"{TEST_URI}v1/config", @@ -343,6 +367,9 @@ def test_config_sets_headers(requests_mock: Mocker) -> None: ) +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_400(rest_mock: Mocker) -> None: rest_mock.post( f"{TEST_URI}v1/oauth/tokens", @@ -356,6 +383,9 @@ def test_token_400(rest_mock: Mocker) -> None: assert str(e.value) == "invalid_client: Credentials for key invalid_key do not match" +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_token_401(rest_mock: Mocker) -> None: message = "invalid_client" rest_mock.post( @@ -489,6 +519,9 @@ def test_list_namespace_with_parent_200(rest_mock: Mocker) -> None: ] +@pytest.mark.filterwarnings( + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" +) def test_list_namespaces_token_expired(rest_mock: Mocker) -> None: new_token = "new_jwt_token" new_header = dict(TEST_HEADERS) diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index 085150edec..9d7a3ac094 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -70,7 +70,6 @@ def test_equals_false() -> None: def test_is_null() -> None: assert IsNull("foo") == parser.parse("foo is null") assert IsNull("foo") == parser.parse("foo IS NULL") - assert IsNull("foo") == parser.parse("table.foo IS NULL") def test_not_null() -> None: From 59fffe30204185f8f3981f2dd51047f540eaa6ef Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 5 Jan 2025 18:32:23 -0500 Subject: [PATCH 11/32] [infra] replace `pycln` with `ruff` (#1485) * pre-commit autoupdate * run ruff linter and formatter * remove pycln * ignore some rules * make lint * poetry add ruff --dev * remove ruff from dev dep * git checkout apache/main poetry.lock * add back --exit-non-zero-on-fix --- .pre-commit-config.yaml | 15 +- pyiceberg/cli/output.py | 12 +- pyiceberg/expressions/visitors.py | 16 +- pyiceberg/io/pyarrow.py | 44 +- pyiceberg/manifest.py | 56 ++- pyiceberg/schema.py | 14 +- pyiceberg/table/__init__.py | 32 +- pyiceberg/table/inspect.py | 480 ++++++++++--------- ruff.toml | 2 +- tests/avro/test_resolver.py | 50 +- tests/avro/test_writer.py | 40 +- tests/catalog/test_rest.py | 48 +- tests/catalog/test_sql.py | 34 +- tests/conftest.py | 296 ++++++------ tests/expressions/test_evaluator.py | 30 +- tests/expressions/test_visitors.py | 480 +++++++++---------- tests/integration/test_add_files.py | 104 ++-- tests/integration/test_deletes.py | 16 +- tests/integration/test_reads.py | 28 +- tests/integration/test_rest_schema.py | 20 +- tests/integration/test_writes/test_writes.py | 180 ++++--- tests/io/test_pyarrow.py | 122 +++-- tests/io/test_pyarrow_visitor.py | 352 +++++++------- tests/table/test_init.py | 114 ++--- tests/table/test_name_mapping.py | 244 +++++----- tests/test_schema.py | 24 +- tests/utils/test_manifest.py | 6 +- 27 files changed, 1535 insertions(+), 1324 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bdd1f362b5..e3dc04bde3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,26 +28,19 @@ repos: - id: check-yaml - id: check-ast - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version (Used for linting) - rev: v0.7.4 + rev: v0.8.6 hooks: - id: ruff - args: [ --fix, --exit-non-zero-on-fix, --preview ] + args: [ --fix, --exit-non-zero-on-fix ] - id: ruff-format - args: [ --preview ] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.14.1 hooks: - id: mypy args: [--install-types, --non-interactive, --config=pyproject.toml] - - repo: https://github.com/hadialqattan/pycln - rev: v2.4.0 - hooks: - - id: pycln - args: [--config=pyproject.toml] - repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.42.0 + rev: v0.43.0 hooks: - id: markdownlint args: ["--fix"] diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index a4183c32bd..0eb85841bf 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -242,8 +242,10 @@ def version(self, version: str) -> None: self._out({"version": version}) def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: - self._out([ - {"name": name, "type": type, detail_key: detail_val} - for name, type, detail in 
refs - for detail_key, detail_val in detail.items() - ]) + self._out( + [ + {"name": name, "type": type, detail_key: detail_val} + for name, type, detail in refs + for detail_key, detail_val in detail.items() + ] + ) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index 26698921b5..768878b068 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -1228,7 +1228,7 @@ def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound >= literal.value: + if lower_bound >= literal.value: # type: ignore[operator] return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH @@ -1249,7 +1249,7 @@ def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> b # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound > literal.value: + if lower_bound > literal.value: # type: ignore[operator] return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH @@ -1266,7 +1266,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: if upper_bound_bytes := self.upper_bounds.get(field_id): upper_bound = from_bytes(field.field_type, upper_bound_bytes) - if upper_bound <= literal.value: + if upper_bound <= literal.value: # type: ignore[operator] if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1287,7 +1287,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) - if upper_bound_bytes := self.upper_bounds.get(field_id): upper_bound = from_bytes(field.field_type, upper_bound_bytes) - if upper_bound < literal.value: + if upper_bound < literal.value: # type: ignore[operator] if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1312,7 +1312,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound > literal.value: + if lower_bound > literal.value: # type: ignore[operator] return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): @@ -1321,7 +1321,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if upper_bound < literal.value: + if upper_bound < literal.value: # type: ignore[operator] return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH @@ -1349,7 +1349,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. 
return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if lower_bound <= lit} + literals = {lit for lit in literals if lower_bound <= lit} # type: ignore[operator] if len(literals) == 0: return ROWS_CANNOT_MATCH @@ -1359,7 +1359,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: if self._is_nan(upper_bound): return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if upper_bound >= lit} + literals = {lit for lit in literals if upper_bound >= lit} # type: ignore[operator] if len(literals) == 0: return ROWS_CANNOT_MATCH diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index e8c9f64d63..dc41a7d6a1 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2449,27 +2449,31 @@ def _dataframe_to_data_files( yield from write_file( io=io, table_metadata=table_metadata, - tasks=iter([ - WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema) - for batches in bin_pack_arrow_table(df, target_file_size) - ]), + tasks=iter( + [ + WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema) + for batches in bin_pack_arrow_table(df, target_file_size) + ] + ), ) else: partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df) yield from write_file( io=io, table_metadata=table_metadata, - tasks=iter([ - WriteTask( - write_uuid=write_uuid, - task_id=next(counter), - record_batches=batches, - partition_key=partition.partition_key, - schema=task_schema, - ) - for partition in partitions - for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size) - ]), + tasks=iter( + [ + WriteTask( + write_uuid=write_uuid, + task_id=next(counter), + record_batches=batches, + partition_key=partition.partition_key, + schema=task_schema, + ) + for partition in partitions + for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size) + ] + ), ) @@ -2534,10 +2538,12 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T partition_columns: List[Tuple[PartitionField, NestedField]] = [ (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields ] - partition_values_table = pa.table({ - str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) - for partition, field in partition_columns - }) + partition_values_table = pa.table( + { + str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) + for partition, field in partition_columns + } + ) # Sort by partitions sort_indices = pa.compute.sort_indices( diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index a56da5fc05..5a32a6330c 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -292,28 +292,32 @@ def __repr__(self) -> str: def data_file_with_partition(partition_type: StructType, format_version: TableVersion) -> StructType: - data_file_partition_type = StructType(*[ - NestedField( - field_id=field.field_id, - name=field.name, - field_type=field.field_type, - required=field.required, - ) - for field in partition_type.fields - ]) + data_file_partition_type = StructType( + *[ + NestedField( + field_id=field.field_id, + name=field.name, + field_type=field.field_type, + required=field.required, + ) + for field in partition_type.fields + ] + ) - return StructType(*[ - NestedField( - field_id=102, - name="partition", - field_type=data_file_partition_type, - 
required=True, - doc="Partition data tuple, schema based on the partition spec", - ) - if field.field_id == 102 - else field - for field in DATA_FILE_TYPE[format_version].fields - ]) + return StructType( + *[ + NestedField( + field_id=102, + name="partition", + field_type=data_file_partition_type, + required=True, + doc="Partition data tuple, schema based on the partition spec", + ) + if field.field_id == 102 + else field + for field in DATA_FILE_TYPE[format_version].fields + ] + ) class DataFile(Record): @@ -398,10 +402,12 @@ def __eq__(self, other: Any) -> bool: def manifest_entry_schema_with_data_file(format_version: TableVersion, data_file: StructType) -> Schema: - return Schema(*[ - NestedField(2, "data_file", data_file, required=True) if field.field_id == 2 else field - for field in MANIFEST_ENTRY_SCHEMAS[format_version].fields - ]) + return Schema( + *[ + NestedField(2, "data_file", data_file, required=True) if field.field_id == 2 else field + for field in MANIFEST_ENTRY_SCHEMAS[format_version].fields + ] + ) class ManifestEntry(Record): diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index cfe3fe3a7b..5a373cb15f 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1707,12 +1707,14 @@ def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool: return self._is_field_compatible(list_type.element_field) and element_result() def map(self, map_type: MapType, key_result: Callable[[], bool], value_result: Callable[[], bool]) -> bool: - return all([ - self._is_field_compatible(map_type.key_field), - self._is_field_compatible(map_type.value_field), - key_result(), - value_result(), - ]) + return all( + [ + self._is_field_compatible(map_type.key_field), + self._is_field_compatible(map_type.value_field), + key_result(), + value_result(), + ] + ) def primitive(self, primitive: PrimitiveType) -> bool: return True diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 2469a9ed7b..7bc3fe838b 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -629,18 +629,20 @@ def delete( if len(filtered_df) == 0: replaced_files.append((original_file.file, [])) elif len(df) != len(filtered_df): - replaced_files.append(( - original_file.file, - list( - _dataframe_to_data_files( - io=self._table.io, - df=filtered_df, - table_metadata=self.table_metadata, - write_uuid=commit_uuid, - counter=counter, - ) - ), - )) + replaced_files.append( + ( + original_file.file, + list( + _dataframe_to_data_files( + io=self._table.io, + df=filtered_df, + table_metadata=self.table_metadata, + write_uuid=commit_uuid, + counter=counter, + ) + ), + ) + ) if len(replaced_files) > 0: with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as overwrite_snapshot: @@ -680,9 +682,9 @@ def add_files( raise ValueError(f"Cannot add files that are already referenced by table, files: {', '.join(referenced_files)}") if self.table_metadata.name_mapping() is None: - self.set_properties(**{ - TableProperties.DEFAULT_NAME_MAPPING: self.table_metadata.schema().name_mapping.model_dump_json() - }) + self.set_properties( + **{TableProperties.DEFAULT_NAME_MAPPING: self.table_metadata.schema().name_mapping.model_dump_json()} + ) with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot: data_files = _parquet_files_to_data_files( table_metadata=self.table_metadata, file_paths=file_paths, io=self._table.io diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index beee426533..71d38a2279 100644 
--- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -58,14 +58,16 @@ def _get_snapshot(self, snapshot_id: Optional[int] = None) -> Snapshot: def snapshots(self) -> "pa.Table": import pyarrow as pa - snapshots_schema = pa.schema([ - pa.field("committed_at", pa.timestamp(unit="ms"), nullable=False), - pa.field("snapshot_id", pa.int64(), nullable=False), - pa.field("parent_id", pa.int64(), nullable=True), - pa.field("operation", pa.string(), nullable=True), - pa.field("manifest_list", pa.string(), nullable=False), - pa.field("summary", pa.map_(pa.string(), pa.string()), nullable=True), - ]) + snapshots_schema = pa.schema( + [ + pa.field("committed_at", pa.timestamp(unit="ms"), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("parent_id", pa.int64(), nullable=True), + pa.field("operation", pa.string(), nullable=True), + pa.field("manifest_list", pa.string(), nullable=False), + pa.field("summary", pa.map_(pa.string(), pa.string()), nullable=True), + ] + ) snapshots = [] for snapshot in self.tbl.metadata.snapshots: if summary := snapshot.summary: @@ -75,14 +77,16 @@ def snapshots(self) -> "pa.Table": operation = None additional_properties = None - snapshots.append({ - "committed_at": datetime.fromtimestamp(snapshot.timestamp_ms / 1000.0, tz=timezone.utc), - "snapshot_id": snapshot.snapshot_id, - "parent_id": snapshot.parent_snapshot_id, - "operation": str(operation), - "manifest_list": snapshot.manifest_list, - "summary": additional_properties, - }) + snapshots.append( + { + "committed_at": datetime.fromtimestamp(snapshot.timestamp_ms / 1000.0, tz=timezone.utc), + "snapshot_id": snapshot.snapshot_id, + "parent_id": snapshot.parent_snapshot_id, + "operation": str(operation), + "manifest_list": snapshot.manifest_list, + "summary": additional_properties, + } + ) return pa.Table.from_pylist( snapshots, @@ -100,14 +104,16 @@ def entries(self, snapshot_id: Optional[int] = None) -> "pa.Table": def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: pa_bound_type = schema_to_pyarrow(bound_type) - return pa.struct([ - pa.field("column_size", pa.int64(), nullable=True), - pa.field("value_count", pa.int64(), nullable=True), - pa.field("null_value_count", pa.int64(), nullable=True), - pa.field("nan_value_count", pa.int64(), nullable=True), - pa.field("lower_bound", pa_bound_type, nullable=True), - pa.field("upper_bound", pa_bound_type, nullable=True), - ]) + return pa.struct( + [ + pa.field("column_size", pa.int64(), nullable=True), + pa.field("value_count", pa.int64(), nullable=True), + pa.field("null_value_count", pa.int64(), nullable=True), + pa.field("nan_value_count", pa.int64(), nullable=True), + pa.field("lower_bound", pa_bound_type, nullable=True), + pa.field("upper_bound", pa_bound_type, nullable=True), + ] + ) for field in self.tbl.metadata.schema().fields: readable_metrics_struct.append( @@ -117,35 +123,39 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: partition_record = self.tbl.metadata.specs_struct() pa_record_struct = schema_to_pyarrow(partition_record) - entries_schema = pa.schema([ - pa.field("status", pa.int8(), nullable=False), - pa.field("snapshot_id", pa.int64(), nullable=False), - pa.field("sequence_number", pa.int64(), nullable=False), - pa.field("file_sequence_number", pa.int64(), nullable=False), - pa.field( - "data_file", - pa.struct([ - pa.field("content", pa.int8(), nullable=False), - pa.field("file_path", pa.string(), nullable=False), - pa.field("file_format", pa.string(), 
nullable=False), - pa.field("partition", pa_record_struct, nullable=False), - pa.field("record_count", pa.int64(), nullable=False), - pa.field("file_size_in_bytes", pa.int64(), nullable=False), - pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field("key_metadata", pa.binary(), nullable=True), - pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), - pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), - pa.field("sort_order_id", pa.int32(), nullable=True), - ]), - nullable=False, - ), - pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), - ]) + entries_schema = pa.schema( + [ + pa.field("status", pa.int8(), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("sequence_number", pa.int64(), nullable=False), + pa.field("file_sequence_number", pa.int64(), nullable=False), + pa.field( + "data_file", + pa.struct( + [ + pa.field("content", pa.int8(), nullable=False), + pa.field("file_path", pa.string(), nullable=False), + pa.field("file_format", pa.string(), nullable=False), + pa.field("partition", pa_record_struct, nullable=False), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_size_in_bytes", pa.int64(), nullable=False), + pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("key_metadata", pa.binary(), nullable=True), + pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), + pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), + pa.field("sort_order_id", pa.int32(), nullable=True), + ] + ), + nullable=False, + ), + pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), + ] + ) entries = [] snapshot = self._get_snapshot(snapshot_id) @@ -180,32 +190,34 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields) } - entries.append({ - "status": entry.status.value, - "snapshot_id": entry.snapshot_id, - "sequence_number": entry.sequence_number, - "file_sequence_number": entry.file_sequence_number, - "data_file": { - "content": entry.data_file.content, - "file_path": entry.data_file.file_path, - "file_format": entry.data_file.file_format, - "partition": partition_record_dict, - "record_count": entry.data_file.record_count, - "file_size_in_bytes": entry.data_file.file_size_in_bytes, - "column_sizes": dict(entry.data_file.column_sizes), - "value_counts": dict(entry.data_file.value_counts), - "null_value_counts": dict(entry.data_file.null_value_counts), - "nan_value_counts": entry.data_file.nan_value_counts, - "lower_bounds": entry.data_file.lower_bounds, - "upper_bounds": entry.data_file.upper_bounds, - 
"key_metadata": entry.data_file.key_metadata, - "split_offsets": entry.data_file.split_offsets, - "equality_ids": entry.data_file.equality_ids, - "sort_order_id": entry.data_file.sort_order_id, - "spec_id": entry.data_file.spec_id, - }, - "readable_metrics": readable_metrics, - }) + entries.append( + { + "status": entry.status.value, + "snapshot_id": entry.snapshot_id, + "sequence_number": entry.sequence_number, + "file_sequence_number": entry.file_sequence_number, + "data_file": { + "content": entry.data_file.content, + "file_path": entry.data_file.file_path, + "file_format": entry.data_file.file_format, + "partition": partition_record_dict, + "record_count": entry.data_file.record_count, + "file_size_in_bytes": entry.data_file.file_size_in_bytes, + "column_sizes": dict(entry.data_file.column_sizes), + "value_counts": dict(entry.data_file.value_counts), + "null_value_counts": dict(entry.data_file.null_value_counts), + "nan_value_counts": entry.data_file.nan_value_counts, + "lower_bounds": entry.data_file.lower_bounds, + "upper_bounds": entry.data_file.upper_bounds, + "key_metadata": entry.data_file.key_metadata, + "split_offsets": entry.data_file.split_offsets, + "equality_ids": entry.data_file.equality_ids, + "sort_order_id": entry.data_file.sort_order_id, + "spec_id": entry.data_file.spec_id, + }, + "readable_metrics": readable_metrics, + } + ) return pa.Table.from_pylist( entries, @@ -215,26 +227,30 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: def refs(self) -> "pa.Table": import pyarrow as pa - ref_schema = pa.schema([ - pa.field("name", pa.string(), nullable=False), - pa.field("type", pa.dictionary(pa.int32(), pa.string()), nullable=False), - pa.field("snapshot_id", pa.int64(), nullable=False), - pa.field("max_reference_age_in_ms", pa.int64(), nullable=True), - pa.field("min_snapshots_to_keep", pa.int32(), nullable=True), - pa.field("max_snapshot_age_in_ms", pa.int64(), nullable=True), - ]) + ref_schema = pa.schema( + [ + pa.field("name", pa.string(), nullable=False), + pa.field("type", pa.dictionary(pa.int32(), pa.string()), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("max_reference_age_in_ms", pa.int64(), nullable=True), + pa.field("min_snapshots_to_keep", pa.int32(), nullable=True), + pa.field("max_snapshot_age_in_ms", pa.int64(), nullable=True), + ] + ) ref_results = [] for ref in self.tbl.metadata.refs: if snapshot_ref := self.tbl.metadata.refs.get(ref): - ref_results.append({ - "name": ref, - "type": snapshot_ref.snapshot_ref_type.upper(), - "snapshot_id": snapshot_ref.snapshot_id, - "max_reference_age_in_ms": snapshot_ref.max_ref_age_ms, - "min_snapshots_to_keep": snapshot_ref.min_snapshots_to_keep, - "max_snapshot_age_in_ms": snapshot_ref.max_snapshot_age_ms, - }) + ref_results.append( + { + "name": ref, + "type": snapshot_ref.snapshot_ref_type.upper(), + "snapshot_id": snapshot_ref.snapshot_id, + "max_reference_age_in_ms": snapshot_ref.max_ref_age_ms, + "min_snapshots_to_keep": snapshot_ref.min_snapshots_to_keep, + "max_snapshot_age_in_ms": snapshot_ref.max_snapshot_age_ms, + } + ) return pa.Table.from_pylist(ref_results, schema=ref_schema) @@ -243,27 +259,31 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": from pyiceberg.io.pyarrow import schema_to_pyarrow - table_schema = pa.schema([ - pa.field("record_count", pa.int64(), nullable=False), - pa.field("file_count", pa.int32(), nullable=False), - pa.field("total_data_file_size_in_bytes", pa.int64(), nullable=False), - 
pa.field("position_delete_record_count", pa.int64(), nullable=False), - pa.field("position_delete_file_count", pa.int32(), nullable=False), - pa.field("equality_delete_record_count", pa.int64(), nullable=False), - pa.field("equality_delete_file_count", pa.int32(), nullable=False), - pa.field("last_updated_at", pa.timestamp(unit="ms"), nullable=True), - pa.field("last_updated_snapshot_id", pa.int64(), nullable=True), - ]) + table_schema = pa.schema( + [ + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_count", pa.int32(), nullable=False), + pa.field("total_data_file_size_in_bytes", pa.int64(), nullable=False), + pa.field("position_delete_record_count", pa.int64(), nullable=False), + pa.field("position_delete_file_count", pa.int32(), nullable=False), + pa.field("equality_delete_record_count", pa.int64(), nullable=False), + pa.field("equality_delete_file_count", pa.int32(), nullable=False), + pa.field("last_updated_at", pa.timestamp(unit="ms"), nullable=True), + pa.field("last_updated_snapshot_id", pa.int64(), nullable=True), + ] + ) partition_record = self.tbl.metadata.specs_struct() has_partitions = len(partition_record.fields) > 0 if has_partitions: pa_record_struct = schema_to_pyarrow(partition_record) - partitions_schema = pa.schema([ - pa.field("partition", pa_record_struct, nullable=False), - pa.field("spec_id", pa.int32(), nullable=False), - ]) + partitions_schema = pa.schema( + [ + pa.field("partition", pa_record_struct, nullable=False), + pa.field("spec_id", pa.int32(), nullable=False), + ] + ) table_schema = pa.unify_schemas([partitions_schema, table_schema]) @@ -329,27 +349,31 @@ def update_partitions_map( def manifests(self) -> "pa.Table": import pyarrow as pa - partition_summary_schema = pa.struct([ - pa.field("contains_null", pa.bool_(), nullable=False), - pa.field("contains_nan", pa.bool_(), nullable=True), - pa.field("lower_bound", pa.string(), nullable=True), - pa.field("upper_bound", pa.string(), nullable=True), - ]) - - manifest_schema = pa.schema([ - pa.field("content", pa.int8(), nullable=False), - pa.field("path", pa.string(), nullable=False), - pa.field("length", pa.int64(), nullable=False), - pa.field("partition_spec_id", pa.int32(), nullable=False), - pa.field("added_snapshot_id", pa.int64(), nullable=False), - pa.field("added_data_files_count", pa.int32(), nullable=False), - pa.field("existing_data_files_count", pa.int32(), nullable=False), - pa.field("deleted_data_files_count", pa.int32(), nullable=False), - pa.field("added_delete_files_count", pa.int32(), nullable=False), - pa.field("existing_delete_files_count", pa.int32(), nullable=False), - pa.field("deleted_delete_files_count", pa.int32(), nullable=False), - pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), - ]) + partition_summary_schema = pa.struct( + [ + pa.field("contains_null", pa.bool_(), nullable=False), + pa.field("contains_nan", pa.bool_(), nullable=True), + pa.field("lower_bound", pa.string(), nullable=True), + pa.field("upper_bound", pa.string(), nullable=True), + ] + ) + + manifest_schema = pa.schema( + [ + pa.field("content", pa.int8(), nullable=False), + pa.field("path", pa.string(), nullable=False), + pa.field("length", pa.int64(), nullable=False), + pa.field("partition_spec_id", pa.int32(), nullable=False), + pa.field("added_snapshot_id", pa.int64(), nullable=False), + pa.field("added_data_files_count", pa.int32(), nullable=False), + pa.field("existing_data_files_count", pa.int32(), nullable=False), + pa.field("deleted_data_files_count", 
pa.int32(), nullable=False), + pa.field("added_delete_files_count", pa.int32(), nullable=False), + pa.field("existing_delete_files_count", pa.int32(), nullable=False), + pa.field("deleted_delete_files_count", pa.int32(), nullable=False), + pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), + ] + ) def _partition_summaries_to_rows( spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] @@ -376,12 +400,14 @@ def _partition_summaries_to_rows( if field_summary.upper_bound else None ) - rows.append({ - "contains_null": field_summary.contains_null, - "contains_nan": field_summary.contains_nan, - "lower_bound": lower_bound, - "upper_bound": upper_bound, - }) + rows.append( + { + "contains_null": field_summary.contains_null, + "contains_nan": field_summary.contains_nan, + "lower_bound": lower_bound, + "upper_bound": upper_bound, + } + ) return rows specs = self.tbl.metadata.specs() @@ -390,22 +416,26 @@ def _partition_summaries_to_rows( for manifest in snapshot.manifests(self.tbl.io): is_data_file = manifest.content == ManifestContent.DATA is_delete_file = manifest.content == ManifestContent.DELETES - manifests.append({ - "content": manifest.content, - "path": manifest.manifest_path, - "length": manifest.manifest_length, - "partition_spec_id": manifest.partition_spec_id, - "added_snapshot_id": manifest.added_snapshot_id, - "added_data_files_count": manifest.added_files_count if is_data_file else 0, - "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, - "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, - "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, - "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, - "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, - "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) - if manifest.partitions - else [], - }) + manifests.append( + { + "content": manifest.content, + "path": manifest.manifest_path, + "length": manifest.manifest_length, + "partition_spec_id": manifest.partition_spec_id, + "added_snapshot_id": manifest.added_snapshot_id, + "added_data_files_count": manifest.added_files_count if is_data_file else 0, + "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, + "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, + "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, + "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, + "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, + "partition_summaries": _partition_summaries_to_rows( + specs[manifest.partition_spec_id], manifest.partitions + ) + if manifest.partitions + else [], + } + ) return pa.Table.from_pylist( manifests, @@ -417,13 +447,15 @@ def metadata_log_entries(self) -> "pa.Table": from pyiceberg.table.snapshots import MetadataLogEntry - table_schema = pa.schema([ - pa.field("timestamp", pa.timestamp(unit="ms"), nullable=False), - pa.field("file", pa.string(), nullable=False), - pa.field("latest_snapshot_id", pa.int64(), nullable=True), - pa.field("latest_schema_id", pa.int32(), nullable=True), - pa.field("latest_sequence_number", pa.int64(), nullable=True), - ]) + table_schema = pa.schema( + [ + pa.field("timestamp", pa.timestamp(unit="ms"), nullable=False), + pa.field("file", 
pa.string(), nullable=False), + pa.field("latest_snapshot_id", pa.int64(), nullable=True), + pa.field("latest_schema_id", pa.int32(), nullable=True), + pa.field("latest_sequence_number", pa.int64(), nullable=True), + ] + ) def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any]: latest_snapshot = self.tbl.snapshot_as_of_timestamp(metadata_entry.timestamp_ms) @@ -449,12 +481,14 @@ def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any def history(self) -> "pa.Table": import pyarrow as pa - history_schema = pa.schema([ - pa.field("made_current_at", pa.timestamp(unit="ms"), nullable=False), - pa.field("snapshot_id", pa.int64(), nullable=False), - pa.field("parent_id", pa.int64(), nullable=True), - pa.field("is_current_ancestor", pa.bool_(), nullable=False), - ]) + history_schema = pa.schema( + [ + pa.field("made_current_at", pa.timestamp(unit="ms"), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("parent_id", pa.int64(), nullable=True), + pa.field("is_current_ancestor", pa.bool_(), nullable=False), + ] + ) ancestors_ids = {snapshot.snapshot_id for snapshot in ancestors_of(self.tbl.current_snapshot(), self.tbl.metadata)} @@ -464,12 +498,14 @@ def history(self) -> "pa.Table": for snapshot_entry in metadata.snapshot_log: snapshot = metadata.snapshot_by_id(snapshot_entry.snapshot_id) - history.append({ - "made_current_at": datetime.fromtimestamp(snapshot_entry.timestamp_ms / 1000.0, tz=timezone.utc), - "snapshot_id": snapshot_entry.snapshot_id, - "parent_id": snapshot.parent_snapshot_id if snapshot else None, - "is_current_ancestor": snapshot_entry.snapshot_id in ancestors_ids, - }) + history.append( + { + "made_current_at": datetime.fromtimestamp(snapshot_entry.timestamp_ms / 1000.0, tz=timezone.utc), + "snapshot_id": snapshot_entry.snapshot_id, + "parent_id": snapshot.parent_snapshot_id if snapshot else None, + "is_current_ancestor": snapshot_entry.snapshot_id in ancestors_ids, + } + ) return pa.Table.from_pylist(history, schema=history_schema) @@ -483,39 +519,43 @@ def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[S def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: pa_bound_type = schema_to_pyarrow(bound_type) - return pa.struct([ - pa.field("column_size", pa.int64(), nullable=True), - pa.field("value_count", pa.int64(), nullable=True), - pa.field("null_value_count", pa.int64(), nullable=True), - pa.field("nan_value_count", pa.int64(), nullable=True), - pa.field("lower_bound", pa_bound_type, nullable=True), - pa.field("upper_bound", pa_bound_type, nullable=True), - ]) + return pa.struct( + [ + pa.field("column_size", pa.int64(), nullable=True), + pa.field("value_count", pa.int64(), nullable=True), + pa.field("null_value_count", pa.int64(), nullable=True), + pa.field("nan_value_count", pa.int64(), nullable=True), + pa.field("lower_bound", pa_bound_type, nullable=True), + pa.field("upper_bound", pa_bound_type, nullable=True), + ] + ) for field in self.tbl.metadata.schema().fields: readable_metrics_struct.append( pa.field(schema.find_column_name(field.field_id), _readable_metrics_struct(field.field_type), nullable=False) ) - files_schema = pa.schema([ - pa.field("content", pa.int8(), nullable=False), - pa.field("file_path", pa.string(), nullable=False), - pa.field("file_format", pa.dictionary(pa.int32(), pa.string()), nullable=False), - pa.field("spec_id", pa.int32(), nullable=False), - pa.field("record_count", pa.int64(), nullable=False), - 
pa.field("file_size_in_bytes", pa.int64(), nullable=False), - pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field("key_metadata", pa.binary(), nullable=True), - pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), - pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), - pa.field("sort_order_id", pa.int32(), nullable=True), - pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), - ]) + files_schema = pa.schema( + [ + pa.field("content", pa.int8(), nullable=False), + pa.field("file_path", pa.string(), nullable=False), + pa.field("file_format", pa.dictionary(pa.int32(), pa.string()), nullable=False), + pa.field("spec_id", pa.int32(), nullable=False), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_size_in_bytes", pa.int64(), nullable=False), + pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("key_metadata", pa.binary(), nullable=True), + pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), + pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), + pa.field("sort_order_id", pa.int32(), nullable=True), + pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), + ] + ) files: list[dict[str, Any]] = [] @@ -553,25 +593,29 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: } for field in self.tbl.metadata.schema().fields } - files.append({ - "content": data_file.content, - "file_path": data_file.file_path, - "file_format": data_file.file_format, - "spec_id": data_file.spec_id, - "record_count": data_file.record_count, - "file_size_in_bytes": data_file.file_size_in_bytes, - "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None, - "value_counts": dict(data_file.value_counts) if data_file.value_counts is not None else None, - "null_value_counts": dict(data_file.null_value_counts) if data_file.null_value_counts is not None else None, - "nan_value_counts": dict(data_file.nan_value_counts) if data_file.nan_value_counts is not None else None, - "lower_bounds": dict(data_file.lower_bounds) if data_file.lower_bounds is not None else None, - "upper_bounds": dict(data_file.upper_bounds) if data_file.upper_bounds is not None else None, - "key_metadata": data_file.key_metadata, - "split_offsets": data_file.split_offsets, - "equality_ids": data_file.equality_ids, - "sort_order_id": data_file.sort_order_id, - "readable_metrics": readable_metrics, - }) + files.append( + { + "content": data_file.content, + "file_path": data_file.file_path, + "file_format": data_file.file_format, + "spec_id": data_file.spec_id, + "record_count": data_file.record_count, + "file_size_in_bytes": 
data_file.file_size_in_bytes, + "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None, + "value_counts": dict(data_file.value_counts) if data_file.value_counts is not None else None, + "null_value_counts": dict(data_file.null_value_counts) + if data_file.null_value_counts is not None + else None, + "nan_value_counts": dict(data_file.nan_value_counts) if data_file.nan_value_counts is not None else None, + "lower_bounds": dict(data_file.lower_bounds) if data_file.lower_bounds is not None else None, + "upper_bounds": dict(data_file.upper_bounds) if data_file.upper_bounds is not None else None, + "key_metadata": data_file.key_metadata, + "split_offsets": data_file.split_offsets, + "equality_ids": data_file.equality_ids, + "sort_order_id": data_file.sort_order_id, + "readable_metrics": readable_metrics, + } + ) return pa.Table.from_pylist( files, diff --git a/ruff.toml b/ruff.toml index caaa108c84..11fd2a957b 100644 --- a/ruff.toml +++ b/ruff.toml @@ -58,7 +58,7 @@ select = [ "I", # isort "UP", # pyupgrade ] -ignore = ["E501","E203","B024","B028","UP037"] +ignore = ["E501","E203","B024","B028","UP037", "UP035", "UP006"] # Allow autofix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] diff --git a/tests/avro/test_resolver.py b/tests/avro/test_resolver.py index decd9060a4..b5388b5ebb 100644 --- a/tests/avro/test_resolver.py +++ b/tests/avro/test_resolver.py @@ -322,30 +322,34 @@ def test_resolver_initial_value() -> None: def test_resolve_writer() -> None: actual = resolve_writer(record_schema=MANIFEST_ENTRY_SCHEMAS[2], file_schema=MANIFEST_ENTRY_SCHEMAS[1]) - expected = StructWriter(( - (0, IntegerWriter()), - (1, IntegerWriter()), + expected = StructWriter( ( - 4, - StructWriter(( - (1, StringWriter()), - (2, StringWriter()), - (3, StructWriter(())), - (4, IntegerWriter()), - (5, IntegerWriter()), - (None, DefaultWriter(writer=IntegerWriter(), value=67108864)), - (6, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), - (7, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), - (8, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), - (9, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), - (10, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=BinaryWriter()))), - (11, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=BinaryWriter()))), - (12, OptionWriter(option=BinaryWriter())), - (13, OptionWriter(option=ListWriter(element_writer=IntegerWriter()))), - (15, OptionWriter(option=IntegerWriter())), - )), - ), - )) + (0, IntegerWriter()), + (1, IntegerWriter()), + ( + 4, + StructWriter( + ( + (1, StringWriter()), + (2, StringWriter()), + (3, StructWriter(())), + (4, IntegerWriter()), + (5, IntegerWriter()), + (None, DefaultWriter(writer=IntegerWriter(), value=67108864)), + (6, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), + (7, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), + (8, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), + (9, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=IntegerWriter()))), + (10, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=BinaryWriter()))), + (11, OptionWriter(option=MapWriter(key_writer=IntegerWriter(), value_writer=BinaryWriter()))), + 
(12, OptionWriter(option=BinaryWriter())), + (13, OptionWriter(option=ListWriter(element_writer=IntegerWriter()))), + (15, OptionWriter(option=IntegerWriter())), + ) + ), + ), + ) + ) assert actual == expected diff --git a/tests/avro/test_writer.py b/tests/avro/test_writer.py index 5a531c7748..39b8ecc393 100644 --- a/tests/avro/test_writer.py +++ b/tests/avro/test_writer.py @@ -178,15 +178,17 @@ class MyStruct(Record): construct_writer(schema).write(encoder, my_struct) - assert output.getbuffer() == b"".join([ - b"\x18", - zigzag_encode(len(my_struct.properties)), - zigzag_encode(1), - zigzag_encode(2), - zigzag_encode(3), - zigzag_encode(4), - b"\x00", - ]) + assert output.getbuffer() == b"".join( + [ + b"\x18", + zigzag_encode(len(my_struct.properties)), + zigzag_encode(1), + zigzag_encode(2), + zigzag_encode(3), + zigzag_encode(4), + b"\x00", + ] + ) def test_write_struct_with_list() -> None: @@ -206,15 +208,17 @@ class MyStruct(Record): construct_writer(schema).write(encoder, my_struct) - assert output.getbuffer() == b"".join([ - b"\x18", - zigzag_encode(len(my_struct.properties)), - zigzag_encode(1), - zigzag_encode(2), - zigzag_encode(3), - zigzag_encode(4), - b"\x00", - ]) + assert output.getbuffer() == b"".join( + [ + b"\x18", + zigzag_encode(len(my_struct.properties)), + zigzag_encode(1), + zigzag_encode(2), + zigzag_encode(3), + zigzag_encode(4), + b"\x00", + ] + ) def test_write_decimal() -> None: diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 2a4b3a7a1f..21aa9677bd 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -323,19 +323,19 @@ def test_properties_sets_headers(requests_mock: Mocker) -> None: **{"header.Content-Type": "application/vnd.api+json", "header.Customized-Header": "some/value"}, ) - assert catalog._session.headers.get("Content-type") == "application/json", ( - "Expected 'Content-Type' default header not to be overwritten" - ) - assert requests_mock.last_request.headers["Content-type"] == "application/json", ( - "Config request did not include expected 'Content-Type' header" - ) + assert ( + catalog._session.headers.get("Content-type") == "application/json" + ), "Expected 'Content-Type' default header not to be overwritten" + assert ( + requests_mock.last_request.headers["Content-type"] == "application/json" + ), "Config request did not include expected 'Content-Type' header" - assert catalog._session.headers.get("Customized-Header") == "some/value", ( - "Expected 'Customized-Header' header to be 'some/value'" - ) - assert requests_mock.last_request.headers["Customized-Header"] == "some/value", ( - "Config request did not include expected 'Customized-Header' header" - ) + assert ( + catalog._session.headers.get("Customized-Header") == "some/value" + ), "Expected 'Customized-Header' header to be 'some/value'" + assert ( + requests_mock.last_request.headers["Customized-Header"] == "some/value" + ), "Config request did not include expected 'Customized-Header' header" def test_config_sets_headers(requests_mock: Mocker) -> None: @@ -352,19 +352,19 @@ def test_config_sets_headers(requests_mock: Mocker) -> None: catalog = RestCatalog("rest", uri=TEST_URI, warehouse="s3://some-bucket") catalog.create_namespace(namespace) - assert catalog._session.headers.get("Content-type") == "application/json", ( - "Expected 'Content-Type' default header not to be overwritten" - ) - assert requests_mock.last_request.headers["Content-type"] == "application/json", ( - "Create namespace request did not include expected 'Content-Type' 
header" - ) + assert ( + catalog._session.headers.get("Content-type") == "application/json" + ), "Expected 'Content-Type' default header not to be overwritten" + assert ( + requests_mock.last_request.headers["Content-type"] == "application/json" + ), "Create namespace request did not include expected 'Content-Type' header" - assert catalog._session.headers.get("Customized-Header") == "some/value", ( - "Expected 'Customized-Header' header to be 'some/value'" - ) - assert requests_mock.last_request.headers["Customized-Header"] == "some/value", ( - "Create namespace request did not include expected 'Customized-Header' header" - ) + assert ( + catalog._session.headers.get("Customized-Header") == "some/value" + ), "Expected 'Customized-Header' header to be 'some/value'" + assert ( + requests_mock.last_request.headers["Customized-Header"] == "some/value" + ), "Create namespace request did not include expected 'Customized-Header' header" @pytest.mark.filterwarnings( diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 7f72568b41..cffc14d9d7 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -401,12 +401,14 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier) pa.array([True, None, False, True]), # 'baz' column pa.array([None, "A", "B", "C"]), # 'large' column ], - schema=pa.schema([ - pa.field("foo", pa.large_string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("large", pa.large_string(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), ) namespace = Catalog.namespace_from(table_identifier) catalog.create_namespace(namespace) @@ -1426,10 +1428,12 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: "foo": ["a", None, "z"], "bar": [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.large_string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with tbl.transaction() as txn: @@ -1474,10 +1478,12 @@ def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> N "foo": ["a", None, "z"], "bar": [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.large_string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with catalog.create_table_transaction( diff --git a/tests/conftest.py b/tests/conftest.py index 22329b3882..ef980f3818 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -353,49 +353,57 @@ def table_schema_with_all_types() -> Schema: def pyarrow_schema_simple_without_ids() -> "pa.Schema": import pyarrow as pa - return pa.schema([ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - ]) + return pa.schema( + [ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + ] + ) @pytest.fixture(scope="session") def pyarrow_schema_nested_without_ids() -> "pa.Schema": import pyarrow as pa - return pa.schema([ - 
pa.field("foo", pa.string(), nullable=False), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("qux", pa.list_(pa.string()), nullable=False), - pa.field( - "quux", - pa.map_( - pa.string(), - pa.map_(pa.string(), pa.int32()), + return pa.schema( + [ + pa.field("foo", pa.string(), nullable=False), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("qux", pa.list_(pa.string()), nullable=False), + pa.field( + "quux", + pa.map_( + pa.string(), + pa.map_(pa.string(), pa.int32()), + ), + nullable=False, ), - nullable=False, - ), - pa.field( - "location", - pa.list_( - pa.struct([ - pa.field("latitude", pa.float32(), nullable=False), - pa.field("longitude", pa.float32(), nullable=False), - ]), + pa.field( + "location", + pa.list_( + pa.struct( + [ + pa.field("latitude", pa.float32(), nullable=False), + pa.field("longitude", pa.float32(), nullable=False), + ] + ), + ), + nullable=False, ), - nullable=False, - ), - pa.field( - "person", - pa.struct([ - pa.field("name", pa.string(), nullable=True), - pa.field("age", pa.int32(), nullable=False), - ]), - nullable=True, - ), - ]) + pa.field( + "person", + pa.struct( + [ + pa.field("name", pa.string(), nullable=True), + pa.field("age", pa.int32(), nullable=False), + ] + ), + nullable=True, + ), + ] + ) @pytest.fixture(scope="session") @@ -2314,26 +2322,28 @@ def spark() -> "SparkSession": def pa_schema() -> "pa.Schema": import pyarrow as pa - return pa.schema([ - ("bool", pa.bool_()), - ("string", pa.large_string()), - ("string_long", pa.large_string()), - ("int", pa.int32()), - ("long", pa.int64()), - ("float", pa.float32()), - ("double", pa.float64()), - # Not supported by Spark - # ("time", pa.time64('us')), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ("date", pa.date32()), - # Not supported by Spark - # ("time", pa.time64("us")), - # Not natively supported by Arrow - # ("uuid", pa.fixed(16)), - ("binary", pa.large_binary()), - ("fixed", pa.binary(16)), - ]) + return pa.schema( + [ + ("bool", pa.bool_()), + ("string", pa.large_string()), + ("string_long", pa.large_string()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + # Not supported by Spark + # ("time", pa.time64('us')), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ("date", pa.date32()), + # Not supported by Spark + # ("time", pa.time64("us")), + # Not natively supported by Arrow + # ("uuid", pa.fixed(16)), + ("binary", pa.large_binary()), + ("fixed", pa.binary(16)), + ] + ) @pytest.fixture(scope="session") @@ -2415,11 +2425,13 @@ def arrow_table_date_timestamps() -> "pa.Table": None, ], }, - schema=pa.schema([ - ("date", pa.date32()), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ]), + schema=pa.schema( + [ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ] + ), ) @@ -2438,19 +2450,21 @@ def arrow_table_schema_with_all_timestamp_precisions() -> "pa.Schema": """Pyarrow Schema with all supported timestamp types.""" import pyarrow as pa - return pa.schema([ - ("timestamp_s", pa.timestamp(unit="s")), - ("timestamptz_s", pa.timestamp(unit="s", tz="UTC")), - ("timestamp_ms", pa.timestamp(unit="ms")), - ("timestamptz_ms", pa.timestamp(unit="ms", tz="UTC")), - ("timestamp_us", pa.timestamp(unit="us")), - 
("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), - ("timestamp_ns", pa.timestamp(unit="ns")), - ("timestamptz_ns", pa.timestamp(unit="ns", tz="UTC")), - ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="Etc/UTC")), - ("timestamptz_ns_z", pa.timestamp(unit="ns", tz="Z")), - ("timestamptz_s_0000", pa.timestamp(unit="s", tz="+00:00")), - ]) + return pa.schema( + [ + ("timestamp_s", pa.timestamp(unit="s")), + ("timestamptz_s", pa.timestamp(unit="s", tz="UTC")), + ("timestamp_ms", pa.timestamp(unit="ms")), + ("timestamptz_ms", pa.timestamp(unit="ms", tz="UTC")), + ("timestamp_us", pa.timestamp(unit="us")), + ("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ns", pa.timestamp(unit="ns")), + ("timestamptz_ns", pa.timestamp(unit="ns", tz="UTC")), + ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="Etc/UTC")), + ("timestamptz_ns_z", pa.timestamp(unit="ns", tz="Z")), + ("timestamptz_s_0000", pa.timestamp(unit="s", tz="+00:00")), + ] + ) @pytest.fixture(scope="session") @@ -2459,51 +2473,53 @@ def arrow_table_with_all_timestamp_precisions(arrow_table_schema_with_all_timest import pandas as pd import pyarrow as pa - test_data = pd.DataFrame({ - "timestamp_s": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], - "timestamptz_s": [ - datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), - ], - "timestamp_ms": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], - "timestamptz_ms": [ - datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), - ], - "timestamp_us": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], - "timestamptz_us": [ - datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), - ], - "timestamp_ns": [ - pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6), - None, - pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7), - ], - "timestamptz_ns": [ - datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), - ], - "timestamptz_us_etc_utc": [ - datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), - ], - "timestamptz_ns_z": [ - pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6, tz="UTC"), - None, - pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7, tz="UTC"), - ], - "timestamptz_s_0000": [ - datetime(2023, 1, 1, 19, 25, 1, tzinfo=timezone.utc), - None, - datetime(2023, 3, 1, 19, 25, 1, tzinfo=timezone.utc), - ], - }) + test_data = pd.DataFrame( + { + "timestamp_s": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_s": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_ms": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_ms": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_us": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_us": [ + datetime(2023, 1, 1, 19, 25, 00, 
tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_ns": [ + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6), + None, + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7), + ], + "timestamptz_ns": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamptz_us_etc_utc": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamptz_ns_z": [ + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6, tz="UTC"), + None, + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7, tz="UTC"), + ], + "timestamptz_s_0000": [ + datetime(2023, 1, 1, 19, 25, 1, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 1, tzinfo=timezone.utc), + ], + } + ) return pa.Table.from_pandas(test_data, schema=arrow_table_schema_with_all_timestamp_precisions) @@ -2512,19 +2528,21 @@ def arrow_table_schema_with_all_microseconds_timestamp_precisions() -> "pa.Schem """Pyarrow Schema with all microseconds timestamp.""" import pyarrow as pa - return pa.schema([ - ("timestamp_s", pa.timestamp(unit="us")), - ("timestamptz_s", pa.timestamp(unit="us", tz="UTC")), - ("timestamp_ms", pa.timestamp(unit="us")), - ("timestamptz_ms", pa.timestamp(unit="us", tz="UTC")), - ("timestamp_us", pa.timestamp(unit="us")), - ("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), - ("timestamp_ns", pa.timestamp(unit="us")), - ("timestamptz_ns", pa.timestamp(unit="us", tz="UTC")), - ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="UTC")), - ("timestamptz_ns_z", pa.timestamp(unit="us", tz="UTC")), - ("timestamptz_s_0000", pa.timestamp(unit="us", tz="UTC")), - ]) + return pa.schema( + [ + ("timestamp_s", pa.timestamp(unit="us")), + ("timestamptz_s", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ms", pa.timestamp(unit="us")), + ("timestamptz_ms", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_us", pa.timestamp(unit="us")), + ("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ns", pa.timestamp(unit="us")), + ("timestamptz_ns", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_ns_z", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_s_0000", pa.timestamp(unit="us", tz="UTC")), + ] + ) @pytest.fixture(scope="session") @@ -2578,13 +2596,15 @@ def pyarrow_schema_with_promoted_types() -> "pa.Schema": """Pyarrow Schema with longs, doubles and uuid in simple and nested types.""" import pyarrow as pa - return pa.schema(( - pa.field("long", pa.int32(), nullable=True), # can support upcasting integer to long - pa.field("list", pa.list_(pa.int32()), nullable=False), # can support upcasting integer to long - pa.field("map", pa.map_(pa.string(), pa.int32()), nullable=False), # can support upcasting integer to long - pa.field("double", pa.float32(), nullable=True), # can support upcasting float to double - pa.field("uuid", pa.binary(length=16), nullable=True), # can support upcasting float to double - )) + return pa.schema( + ( + pa.field("long", pa.int32(), nullable=True), # can support upcasting integer to long + pa.field("list", pa.list_(pa.int32()), nullable=False), # can support upcasting integer to long + pa.field("map", pa.map_(pa.string(), 
pa.int32()), nullable=False), # can support upcasting integer to long + pa.field("double", pa.float32(), nullable=True), # can support upcasting float to double + pa.field("uuid", pa.binary(length=16), nullable=True), # can support upcasting float to double + ) + ) @pytest.fixture(scope="session") diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index f8a9a8806d..e2b1f27377 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -681,25 +681,25 @@ def data_file_nan() -> DataFile: def test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None: for operator in [LessThan, LessThanOrEqual]: - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: all nan column doesn't contain number" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: 1 is smaller than lower bound" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 10)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 10)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: 10 is larger than lower bound" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("min_max_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("min_max_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: no visibility" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan_null_bounds", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan_null_bounds", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: all nan column doesn't contain number" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: 1 is smaller than lower bound" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 10)).eval( + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 10)).eval( # type: ignore[arg-type] data_file_nan ) assert should_read, "Should match: 10 larger than lower bound" @@ -709,30 +709,30 @@ def test_inclusive_metrics_evaluator_greater_than_and_greater_than_equal( schema_data_file_nan: Schema, data_file_nan: DataFile ) -> None: for operator in [GreaterThan, GreaterThanOrEqual]: - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: all nan column doesn't contain number" 
- should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: upper bound is larger than 1" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 10)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("max_nan", 10)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: upper bound is larger than 10" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("min_max_nan", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("min_max_nan", 1)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: no visibility" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan_null_bounds", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan_null_bounds", 1)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: all nan column doesn't contain number" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 1)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 1)).eval(data_file_nan) # type: ignore[arg-type] assert should_read, "Should match: 1 is smaller than upper bound" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 10)).eval( + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("some_nan_correct_bounds", 10)).eval( # type: ignore[arg-type] data_file_nan ) assert should_read, "Should match: 10 is smaller than upper bound" - should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 30)).eval(data_file_nan) + should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 30)).eval(data_file_nan) # type: ignore[arg-type] assert not should_read, "Should not match: 30 is greater than upper bound" diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py index d61c193719..94bfcf076c 100644 --- a/tests/expressions/test_visitors.py +++ b/tests/expressions/test_visitors.py @@ -947,95 +947,95 @@ def manifest() -> ManifestFile: def test_all_nulls(schema: Schema, manifest: ManifestFile) -> None: - assert not _ManifestEvalVisitor(schema, NotNull(Reference("all_nulls_missing_nan")), case_sensitive=True).eval(manifest), ( - "Should skip: all nulls column with non-floating type contains all null" - ) + assert not _ManifestEvalVisitor(schema, NotNull(Reference("all_nulls_missing_nan")), case_sensitive=True).eval( + manifest + ), "Should skip: all nulls column with non-floating type contains all null" - assert _ManifestEvalVisitor(schema, NotNull(Reference("all_nulls_missing_nan_float")), case_sensitive=True).eval(manifest), ( - "Should read: no NaN information may indicate presence of NaN value" - ) + assert _ManifestEvalVisitor(schema, NotNull(Reference("all_nulls_missing_nan_float")), case_sensitive=True).eval( + manifest + ), "Should read: no NaN information may indicate presence of NaN value" - assert _ManifestEvalVisitor(schema, NotNull(Reference("some_nulls")), case_sensitive=True).eval(manifest), ( - "Should read: column with some nulls 
contains a non-null value" - ) + assert _ManifestEvalVisitor(schema, NotNull(Reference("some_nulls")), case_sensitive=True).eval( + manifest + ), "Should read: column with some nulls contains a non-null value" - assert _ManifestEvalVisitor(schema, NotNull(Reference("no_nulls")), case_sensitive=True).eval(manifest), ( - "Should read: non-null column contains a non-null value" - ) + assert _ManifestEvalVisitor(schema, NotNull(Reference("no_nulls")), case_sensitive=True).eval( + manifest + ), "Should read: non-null column contains a non-null value" def test_no_nulls(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, IsNull(Reference("all_nulls_missing_nan")), case_sensitive=True).eval(manifest), ( - "Should read: at least one null value in all null column" - ) + assert _ManifestEvalVisitor(schema, IsNull(Reference("all_nulls_missing_nan")), case_sensitive=True).eval( + manifest + ), "Should read: at least one null value in all null column" - assert _ManifestEvalVisitor(schema, IsNull(Reference("some_nulls")), case_sensitive=True).eval(manifest), ( - "Should read: column with some nulls contains a null value" - ) + assert _ManifestEvalVisitor(schema, IsNull(Reference("some_nulls")), case_sensitive=True).eval( + manifest + ), "Should read: column with some nulls contains a null value" - assert not _ManifestEvalVisitor(schema, IsNull(Reference("no_nulls")), case_sensitive=True).eval(manifest), ( - "Should skip: non-null column contains no null values" - ) + assert not _ManifestEvalVisitor(schema, IsNull(Reference("no_nulls")), case_sensitive=True).eval( + manifest + ), "Should skip: non-null column contains no null values" - assert _ManifestEvalVisitor(schema, IsNull(Reference("both_nan_and_null")), case_sensitive=True).eval(manifest), ( - "Should read: both_nan_and_null column contains no null values" - ) + assert _ManifestEvalVisitor(schema, IsNull(Reference("both_nan_and_null")), case_sensitive=True).eval( + manifest + ), "Should read: both_nan_and_null column contains no null values" def test_is_nan(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, IsNaN(Reference("float")), case_sensitive=True).eval(manifest), ( - "Should read: no information on if there are nan value in float column" - ) + assert _ManifestEvalVisitor(schema, IsNaN(Reference("float")), case_sensitive=True).eval( + manifest + ), "Should read: no information on if there are nan value in float column" - assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_double")), case_sensitive=True).eval(manifest), ( - "Should read: no NaN information may indicate presence of NaN value" - ) + assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_double")), case_sensitive=True).eval( + manifest + ), "Should read: no NaN information may indicate presence of NaN value" - assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_missing_nan_float")), case_sensitive=True).eval(manifest), ( - "Should read: no NaN information may indicate presence of NaN value" - ) + assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_missing_nan_float")), case_sensitive=True).eval( + manifest + ), "Should read: no NaN information may indicate presence of NaN value" - assert not _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_no_nans")), case_sensitive=True).eval(manifest), ( - "Should skip: no nan column doesn't contain nan value" - ) + assert not _ManifestEvalVisitor(schema, IsNaN(Reference("all_nulls_no_nans")), case_sensitive=True).eval( + 
manifest + ), "Should skip: no nan column doesn't contain nan value" - assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nans")), case_sensitive=True).eval(manifest), ( - "Should read: all_nans column contains nan value" - ) + assert _ManifestEvalVisitor(schema, IsNaN(Reference("all_nans")), case_sensitive=True).eval( + manifest + ), "Should read: all_nans column contains nan value" - assert _ManifestEvalVisitor(schema, IsNaN(Reference("both_nan_and_null")), case_sensitive=True).eval(manifest), ( - "Should read: both_nan_and_null column contains nan value" - ) + assert _ManifestEvalVisitor(schema, IsNaN(Reference("both_nan_and_null")), case_sensitive=True).eval( + manifest + ), "Should read: both_nan_and_null column contains nan value" - assert not _ManifestEvalVisitor(schema, IsNaN(Reference("no_nan_or_null")), case_sensitive=True).eval(manifest), ( - "Should skip: no_nan_or_null column doesn't contain nan value" - ) + assert not _ManifestEvalVisitor(schema, IsNaN(Reference("no_nan_or_null")), case_sensitive=True).eval( + manifest + ), "Should skip: no_nan_or_null column doesn't contain nan value" def test_not_nan(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, NotNaN(Reference("float")), case_sensitive=True).eval(manifest), ( - "Should read: no information on if there are nan value in float column" - ) + assert _ManifestEvalVisitor(schema, NotNaN(Reference("float")), case_sensitive=True).eval( + manifest + ), "Should read: no information on if there are nan value in float column" - assert _ManifestEvalVisitor(schema, NotNaN(Reference("all_nulls_double")), case_sensitive=True).eval(manifest), ( - "Should read: all null column contains non nan value" - ) + assert _ManifestEvalVisitor(schema, NotNaN(Reference("all_nulls_double")), case_sensitive=True).eval( + manifest + ), "Should read: all null column contains non nan value" - assert _ManifestEvalVisitor(schema, NotNaN(Reference("all_nulls_no_nans")), case_sensitive=True).eval(manifest), ( - "Should read: no_nans column contains non nan value" - ) + assert _ManifestEvalVisitor(schema, NotNaN(Reference("all_nulls_no_nans")), case_sensitive=True).eval( + manifest + ), "Should read: no_nans column contains non nan value" - assert not _ManifestEvalVisitor(schema, NotNaN(Reference("all_nans")), case_sensitive=True).eval(manifest), ( - "Should skip: all nans column doesn't contain non nan value" - ) + assert not _ManifestEvalVisitor(schema, NotNaN(Reference("all_nans")), case_sensitive=True).eval( + manifest + ), "Should skip: all nans column doesn't contain non nan value" - assert _ManifestEvalVisitor(schema, NotNaN(Reference("both_nan_and_null")), case_sensitive=True).eval(manifest), ( - "Should read: both_nan_and_null nans column contains non nan value" - ) + assert _ManifestEvalVisitor(schema, NotNaN(Reference("both_nan_and_null")), case_sensitive=True).eval( + manifest + ), "Should read: both_nan_and_null nans column contains non nan value" - assert _ManifestEvalVisitor(schema, NotNaN(Reference("no_nan_or_null")), case_sensitive=True).eval(manifest), ( - "Should read: no_nan_or_null column contains non nan value" - ) + assert _ManifestEvalVisitor(schema, NotNaN(Reference("no_nan_or_null")), case_sensitive=True).eval( + manifest + ), "Should read: no_nan_or_null column contains non nan value" def test_missing_stats(schema: Schema, manifest_no_stats: ManifestFile) -> None: @@ -1053,15 +1053,15 @@ def test_missing_stats(schema: Schema, manifest_no_stats: ManifestFile) -> None: ] for expr in 
expressions: - assert _ManifestEvalVisitor(schema, expr, case_sensitive=True).eval(manifest_no_stats), ( - f"Should read when missing stats for expr: {expr}" - ) + assert _ManifestEvalVisitor(schema, expr, case_sensitive=True).eval( + manifest_no_stats + ), f"Should read when missing stats for expr: {expr}" def test_not(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, Not(LessThan(Reference("id"), INT_MIN_VALUE - 25)), case_sensitive=True).eval(manifest), ( - "Should read: not(false)" - ) + assert _ManifestEvalVisitor(schema, Not(LessThan(Reference("id"), INT_MIN_VALUE - 25)), case_sensitive=True).eval( + manifest + ), "Should read: not(false)" assert not _ManifestEvalVisitor(schema, Not(GreaterThan(Reference("id"), INT_MIN_VALUE - 25)), case_sensitive=True).eval( manifest @@ -1118,21 +1118,21 @@ def test_or(schema: Schema, manifest: ManifestFile) -> None: def test_integer_lt(schema: Schema, manifest: ManifestFile) -> None: - assert not _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval(manifest), ( - "Should not read: id range below lower bound (5 < 30)" - ) + assert not _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval( + manifest + ), "Should not read: id range below lower bound (5 < 30)" - assert not _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval(manifest), ( - "Should not read: id range below lower bound (30 is not < 30)" - ) + assert not _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval( + manifest + ), "Should not read: id range below lower bound (30 is not < 30)" - assert _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE + 1), case_sensitive=True).eval(manifest), ( - "Should read: one possible id" - ) + assert _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MIN_VALUE + 1), case_sensitive=True).eval( + manifest + ), "Should read: one possible id" - assert _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: may possible ids" - ) + assert _ManifestEvalVisitor(schema, LessThan(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: may possible ids" def test_integer_lt_eq(schema: Schema, manifest: ManifestFile) -> None: @@ -1144,13 +1144,13 @@ def test_integer_lt_eq(schema: Schema, manifest: ManifestFile) -> None: manifest ), "Should not read: id range below lower bound (29 < 30)" - assert _ManifestEvalVisitor(schema, LessThanOrEqual(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: one possible id" - ) + assert _ManifestEvalVisitor(schema, LessThanOrEqual(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: one possible id" - assert _ManifestEvalVisitor(schema, LessThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: many possible ids" - ) + assert _ManifestEvalVisitor(schema, LessThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: many possible ids" def test_integer_gt(schema: Schema, manifest: ManifestFile) -> None: @@ -1158,17 +1158,17 @@ def test_integer_gt(schema: Schema, manifest: ManifestFile) -> None: manifest ), "Should not read: id range above upper bound (85 < 79)" - assert not _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), 
INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should not read: id range above upper bound (79 is not > 79)" - ) + assert not _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should not read: id range above upper bound (79 is not > 79)" - assert _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), INT_MAX_VALUE - 1), case_sensitive=True).eval(manifest), ( - "Should read: one possible id" - ) + assert _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), INT_MAX_VALUE - 1), case_sensitive=True).eval( + manifest + ), "Should read: one possible id" - assert _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval(manifest), ( - "Should read: may possible ids" - ) + assert _ManifestEvalVisitor(schema, GreaterThan(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval( + manifest + ), "Should read: may possible ids" def test_integer_gt_eq(schema: Schema, manifest: ManifestFile) -> None: @@ -1180,133 +1180,133 @@ def test_integer_gt_eq(schema: Schema, manifest: ManifestFile) -> None: manifest ), "Should not read: id range above upper bound (80 > 79)" - assert _ManifestEvalVisitor(schema, GreaterThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: one possible id" - ) + assert _ManifestEvalVisitor(schema, GreaterThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: one possible id" - assert _ManifestEvalVisitor(schema, GreaterThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: may possible ids" - ) + assert _ManifestEvalVisitor(schema, GreaterThanOrEqual(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: may possible ids" def test_integer_eq(schema: Schema, manifest: ManifestFile) -> None: - assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval(manifest), ( - "Should not read: id below lower bound" - ) + assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval( + manifest + ), "Should not read: id below lower bound" - assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE - 1), case_sensitive=True).eval(manifest), ( - "Should not read: id below lower bound" - ) + assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE - 1), case_sensitive=True).eval( + manifest + ), "Should not read: id below lower bound" - assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: id equal to lower bound" - ) + assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: id equal to lower bound" - assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval(manifest), ( - "Should read: id between lower and upper bounds" - ) + assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval( + manifest + ), "Should read: id between lower and upper bounds" - assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: id equal to upper bound" - ) + assert _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE), 
case_sensitive=True).eval( + manifest + ), "Should read: id equal to upper bound" - assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE + 1), case_sensitive=True).eval(manifest), ( - "Should not read: id above upper bound" - ) + assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE + 1), case_sensitive=True).eval( + manifest + ), "Should not read: id above upper bound" - assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE + 6), case_sensitive=True).eval(manifest), ( - "Should not read: id above upper bound" - ) + assert not _ManifestEvalVisitor(schema, EqualTo(Reference("id"), INT_MAX_VALUE + 6), case_sensitive=True).eval( + manifest + ), "Should not read: id above upper bound" def test_integer_not_eq(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE - 25), case_sensitive=True).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE - 1), case_sensitive=True).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE - 1), case_sensitive=True).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: id equal to lower bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MIN_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: id equal to lower bound" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval(manifest), ( - "Should read: id between lower and upper bounds" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE - 4), case_sensitive=True).eval( + manifest + ), "Should read: id between lower and upper bounds" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval(manifest), ( - "Should read: id equal to upper bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE), case_sensitive=True).eval( + manifest + ), "Should read: id equal to upper bound" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE + 1), case_sensitive=True).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE + 1), case_sensitive=True).eval( + manifest + ), "Should read: id above upper bound" - assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE + 6), case_sensitive=True).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, NotEqualTo(Reference("id"), INT_MAX_VALUE + 6), case_sensitive=True).eval( + manifest + ), "Should read: id above upper bound" def test_integer_not_eq_rewritten(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE - 25)), case_sensitive=True).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE - 25)), 
case_sensitive=True).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE - 1)), case_sensitive=True).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE - 1)), case_sensitive=True).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE)), case_sensitive=True).eval(manifest), ( - "Should read: id equal to lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MIN_VALUE)), case_sensitive=True).eval( + manifest + ), "Should read: id equal to lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE - 4)), case_sensitive=True).eval(manifest), ( - "Should read: id between lower and upper bounds" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE - 4)), case_sensitive=True).eval( + manifest + ), "Should read: id between lower and upper bounds" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE)), case_sensitive=True).eval(manifest), ( - "Should read: id equal to upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE)), case_sensitive=True).eval( + manifest + ), "Should read: id equal to upper bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE + 1)), case_sensitive=True).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE + 1)), case_sensitive=True).eval( + manifest + ), "Should read: id above upper bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE + 6)), case_sensitive=True).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("id"), INT_MAX_VALUE + 6)), case_sensitive=True).eval( + manifest + ), "Should read: id above upper bound" def test_integer_not_eq_rewritten_case_insensitive(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE - 25)), case_sensitive=False).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE - 25)), case_sensitive=False).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE - 1)), case_sensitive=False).eval(manifest), ( - "Should read: id below lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE - 1)), case_sensitive=False).eval( + manifest + ), "Should read: id below lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE)), case_sensitive=False).eval(manifest), ( - "Should read: id equal to lower bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MIN_VALUE)), case_sensitive=False).eval( + manifest + ), "Should read: id equal to lower bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE - 4)), case_sensitive=False).eval(manifest), ( - "Should read: id between lower and upper bounds" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE - 4)), case_sensitive=False).eval( + manifest + ), "Should 
read: id between lower and upper bounds" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE)), case_sensitive=False).eval(manifest), ( - "Should read: id equal to upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE)), case_sensitive=False).eval( + manifest + ), "Should read: id equal to upper bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE + 1)), case_sensitive=False).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE + 1)), case_sensitive=False).eval( + manifest + ), "Should read: id above upper bound" - assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE + 6)), case_sensitive=False).eval(manifest), ( - "Should read: id above upper bound" - ) + assert _ManifestEvalVisitor(schema, Not(EqualTo(Reference("ID"), INT_MAX_VALUE + 6)), case_sensitive=False).eval( + manifest + ), "Should read: id above upper bound" def test_integer_in(schema: Schema, manifest: ManifestFile) -> None: @@ -1342,13 +1342,13 @@ def test_integer_in(schema: Schema, manifest: ManifestFile) -> None: manifest ), "Should skip: in on all nulls column" - assert _ManifestEvalVisitor(schema, In(Reference("some_nulls"), ("abc", "def")), case_sensitive=True).eval(manifest), ( - "Should read: in on some nulls column" - ) + assert _ManifestEvalVisitor(schema, In(Reference("some_nulls"), ("abc", "def")), case_sensitive=True).eval( + manifest + ), "Should read: in on some nulls column" - assert _ManifestEvalVisitor(schema, In(Reference("no_nulls"), ("abc", "def")), case_sensitive=True).eval(manifest), ( - "Should read: in on no nulls column" - ) + assert _ManifestEvalVisitor(schema, In(Reference("no_nulls"), ("abc", "def")), case_sensitive=True).eval( + manifest + ), "Should read: in on no nulls column" def test_integer_not_in(schema: Schema, manifest: ManifestFile) -> None: @@ -1384,73 +1384,73 @@ def test_integer_not_in(schema: Schema, manifest: ManifestFile) -> None: manifest ), "Should read: notIn on no nulls column" - assert _ManifestEvalVisitor(schema, NotIn(Reference("some_nulls"), ("abc", "def")), case_sensitive=True).eval(manifest), ( - "Should read: in on some nulls column" - ) + assert _ManifestEvalVisitor(schema, NotIn(Reference("some_nulls"), ("abc", "def")), case_sensitive=True).eval( + manifest + ), "Should read: in on some nulls column" - assert _ManifestEvalVisitor(schema, NotIn(Reference("no_nulls"), ("abc", "def")), case_sensitive=True).eval(manifest), ( - "Should read: in on no nulls column" - ) + assert _ManifestEvalVisitor(schema, NotIn(Reference("no_nulls"), ("abc", "def")), case_sensitive=True).eval( + manifest + ), "Should read: in on no nulls column" def test_string_starts_with(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "a"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "a"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "aa"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "aa"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, 
StartsWith(Reference("some_nulls"), "dddd"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "dddd"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "z"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "z"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, StartsWith(Reference("no_nulls"), "a"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, StartsWith(Reference("no_nulls"), "a"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert not _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "zzzz"), case_sensitive=False).eval(manifest), ( - "Should skip: range doesn't match" - ) + assert not _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "zzzz"), case_sensitive=False).eval( + manifest + ), "Should skip: range doesn't match" - assert not _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "1"), case_sensitive=False).eval(manifest), ( - "Should skip: range doesn't match" - ) + assert not _ManifestEvalVisitor(schema, StartsWith(Reference("some_nulls"), "1"), case_sensitive=False).eval( + manifest + ), "Should skip: range doesn't match" def test_string_not_starts_with(schema: Schema, manifest: ManifestFile) -> None: - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "a"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "a"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "aa"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "aa"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "dddd"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "dddd"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "z"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "z"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("no_nulls"), "a"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("no_nulls"), "a"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "zzzz"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "zzzz"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" - assert 
_ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "1"), case_sensitive=False).eval(manifest), ( - "Should read: range matches" - ) + assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("some_nulls"), "1"), case_sensitive=False).eval( + manifest + ), "Should read: range matches" assert _ManifestEvalVisitor(schema, NotStartsWith(Reference("all_same_value_or_null"), "a"), case_sensitive=False).eval( manifest diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 85e626edf4..c1d916e0e0 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -52,12 +52,14 @@ NestedField(field_id=10, name="qux", field_type=DateType(), required=False), ) -ARROW_SCHEMA = pa.schema([ - ("foo", pa.bool_()), - ("bar", pa.string()), - ("baz", pa.int32()), - ("qux", pa.date32()), -]) +ARROW_SCHEMA = pa.schema( + [ + ("foo", pa.bool_()), + ("bar", pa.string()), + ("baz", pa.int32()), + ("qux", pa.date32()), + ] +) ARROW_TABLE = pa.Table.from_pylist( [ @@ -71,12 +73,14 @@ schema=ARROW_SCHEMA, ) -ARROW_SCHEMA_WITH_IDS = pa.schema([ - pa.field("foo", pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), - pa.field("bar", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), - pa.field("baz", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), - pa.field("qux", pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), -]) +ARROW_SCHEMA_WITH_IDS = pa.schema( + [ + pa.field("foo", pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), + pa.field("bar", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), + pa.field("baz", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), + pa.field("qux", pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), + ] +) ARROW_TABLE_WITH_IDS = pa.Table.from_pylist( @@ -91,12 +95,14 @@ schema=ARROW_SCHEMA_WITH_IDS, ) -ARROW_SCHEMA_UPDATED = pa.schema([ - ("foo", pa.bool_()), - ("baz", pa.int32()), - ("qux", pa.date32()), - ("quux", pa.int32()), -]) +ARROW_SCHEMA_UPDATED = pa.schema( + [ + ("foo", pa.bool_()), + ("baz", pa.int32()), + ("qux", pa.date32()), + ("quux", pa.int32()), + ] +) ARROW_TABLE_UPDATED = pa.Table.from_pylist( [ @@ -471,12 +477,14 @@ def test_add_files_fails_on_schema_mismatch(spark: SparkSession, session_catalog identifier = f"default.table_schema_mismatch_fails_v{format_version}" tbl = _create_table(session_catalog, identifier, format_version) - WRONG_SCHEMA = pa.schema([ - ("foo", pa.bool_()), - ("bar", pa.string()), - ("baz", pa.string()), # should be integer - ("qux", pa.date32()), - ]) + WRONG_SCHEMA = pa.schema( + [ + ("foo", pa.bool_()), + ("bar", pa.string()), + ("baz", pa.string()), # should be integer + ("qux", pa.date32()), + ] + ) file_path = f"s3://warehouse/default/table_schema_mismatch_fails/v{format_version}/test.parquet" # write parquet files fo = tbl.io.new_output(file_path) @@ -522,12 +530,16 @@ def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_ca identifier = f"default.unpartitioned_with_large_types{format_version}" iceberg_schema = Schema(NestedField(1, "foo", StringType(), required=True)) - arrow_schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False), - ]) - arrow_schema_large = pa.schema([ - pa.field("foo", pa.large_string(), nullable=False), - ]) + arrow_schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False), + ] + ) + arrow_schema_large = pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=False), 
+ ] + ) tbl = _create_table(session_catalog, identifier, format_version, schema=iceberg_schema) @@ -576,9 +588,11 @@ def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_ca def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_version: int, mocker: MockerFixture) -> None: nanoseconds_schema_iceberg = Schema(NestedField(1, "quux", TimestamptzType())) - nanoseconds_schema = pa.schema([ - ("quux", pa.timestamp("ns", tz="UTC")), - ]) + nanoseconds_schema = pa.schema( + [ + ("quux", pa.timestamp("ns", tz="UTC")), + ] + ) arrow_table = pa.Table.from_pylist( [ @@ -617,9 +631,11 @@ def test_add_file_with_valid_nullability_diff(spark: SparkSession, session_catal table_schema = Schema( NestedField(field_id=1, name="long", field_type=LongType(), required=False), ) - other_schema = pa.schema(( - pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field - )) + other_schema = pa.schema( + ( + pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field + ) + ) arrow_table = pa.Table.from_pydict( { "long": [1, 9], @@ -671,13 +687,15 @@ def test_add_files_with_valid_upcast( # table's long field should cast to long on read written_arrow_table = tbl.scan().to_arrow() assert written_arrow_table == pyarrow_table_with_promoted_types.cast( - pa.schema(( - pa.field("long", pa.int64(), nullable=True), - pa.field("list", pa.large_list(pa.int64()), nullable=False), - pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), - pa.field("double", pa.float64(), nullable=True), - pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 - )) + pa.schema( + ( + pa.field("long", pa.int64(), nullable=True), + pa.field("list", pa.large_list(pa.int64()), nullable=False), + pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), + pa.field("double", pa.float64(), nullable=True), + pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 + ) + ) ) lhs = spark.table(f"{identifier}").toPandas() rhs = written_arrow_table.to_pandas() diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index f2417bde2d..ae03beea53 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -746,13 +746,15 @@ def test_delete_after_partition_evolution_from_partitioned(session_catalog: Rest arrow_table = pa.Table.from_arrays( [ pa.array([2, 3, 4, 5, 6]), - pa.array([ - datetime(2021, 5, 19), - datetime(2022, 7, 25), - datetime(2023, 3, 22), - datetime(2024, 7, 17), - datetime(2025, 2, 22), - ]), + pa.array( + [ + datetime(2021, 5, 19), + datetime(2022, 7, 25), + datetime(2023, 3, 22), + datetime(2024, 7, 17), + datetime(2025, 2, 22), + ] + ), ], names=["idx", "ts"], ) diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 0279c2199a..8d13724087 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -833,12 +833,14 @@ def test_table_scan_default_to_large_types(catalog: Catalog) -> None: result_table = tbl.scan().to_arrow() - expected_schema = pa.schema([ - pa.field("string", pa.large_string()), - pa.field("string-to-binary", pa.large_binary()), - pa.field("binary", pa.large_binary()), - pa.field("list", pa.large_list(pa.large_string())), - ]) + expected_schema = pa.schema( + [ + pa.field("string", pa.large_string()), + 
pa.field("string-to-binary", pa.large_binary()), + pa.field("binary", pa.large_binary()), + pa.field("list", pa.large_list(pa.large_string())), + ] + ) assert result_table.schema.equals(expected_schema) @@ -874,12 +876,14 @@ def test_table_scan_override_with_small_types(catalog: Catalog) -> None: tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "False" result_table = tbl.scan().to_arrow() - expected_schema = pa.schema([ - pa.field("string", pa.string()), - pa.field("string-to-binary", pa.binary()), - pa.field("binary", pa.binary()), - pa.field("list", pa.list_(pa.string())), - ]) + expected_schema = pa.schema( + [ + pa.field("string", pa.string()), + pa.field("string-to-binary", pa.binary()), + pa.field("binary", pa.binary()), + pa.field("list", pa.list_(pa.string())), + ] + ) assert result_table.schema.equals(expected_schema) diff --git a/tests/integration/test_rest_schema.py b/tests/integration/test_rest_schema.py index 8e64142b3f..6a704839e2 100644 --- a/tests/integration/test_rest_schema.py +++ b/tests/integration/test_rest_schema.py @@ -685,11 +685,13 @@ def test_rename_simple(simple_table: Table) -> None: ) # Check that the name mapping gets updated - assert simple_table.name_mapping() == NameMapping([ - MappedField(field_id=1, names=["foo", "vo"]), - MappedField(field_id=2, names=["bar", "var"]), - MappedField(field_id=3, names=["baz"]), - ]) + assert simple_table.name_mapping() == NameMapping( + [ + MappedField(field_id=1, names=["foo", "vo"]), + MappedField(field_id=2, names=["bar", "var"]), + MappedField(field_id=3, names=["baz"]), + ] + ) @pytest.mark.integration @@ -719,9 +721,11 @@ def test_rename_simple_nested(catalog: Catalog) -> None: ) # Check that the name mapping gets updated - assert tbl.name_mapping() == NameMapping([ - MappedField(field_id=1, names=["foo"], fields=[MappedField(field_id=2, names=["bar", "vo"])]), - ]) + assert tbl.name_mapping() == NameMapping( + [ + MappedField(field_id=1, names=["foo"], fields=[MappedField(field_id=2, names=["bar", "vo"])]), + ] + ) @pytest.mark.integration diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index f9c0afd3bc..c23e836554 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -324,20 +324,24 @@ def test_python_writes_special_character_column_with_spark_reads( {"street": "789", "city": "Random", "zip": 10112, column_name_with_special_character: "c"}, ], } - pa_schema = pa.schema([ - pa.field(column_name_with_special_character, pa.string()), - pa.field("id", pa.int32()), - pa.field("name", pa.string()), - pa.field( - "address", - pa.struct([ - pa.field("street", pa.string()), - pa.field("city", pa.string()), - pa.field("zip", pa.int32()), - pa.field(column_name_with_special_character, pa.string()), - ]), - ), - ]) + pa_schema = pa.schema( + [ + pa.field(column_name_with_special_character, pa.string()), + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field( + "address", + pa.struct( + [ + pa.field("street", pa.string()), + pa.field("city", pa.string()), + pa.field("zip", pa.int32()), + pa.field(column_name_with_special_character, pa.string()), + ] + ), + ), + ] + ) arrow_table_with_special_character_column = pa.Table.from_pydict(TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN, schema=pa_schema) tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema) @@ -357,10 +361,12 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads( "id": [1, 2, 3, 
1, 1], "name": ["AB", "CD", "EF", "CD", "EF"], } - pa_schema = pa.schema([ - pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)), - pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)), - ]) + pa_schema = pa.schema( + [ + pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)), + pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)), + ] + ) arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema) @@ -387,20 +393,24 @@ def test_python_writes_with_small_and_large_types_spark_reads( {"street": "789", "city": "Random", "zip": 10112, "bar": "c"}, ], } - pa_schema = pa.schema([ - pa.field("foo", pa.large_string()), - pa.field("id", pa.int32()), - pa.field("name", pa.string()), - pa.field( - "address", - pa.struct([ - pa.field("street", pa.string()), - pa.field("city", pa.string()), - pa.field("zip", pa.int32()), - pa.field("bar", pa.large_string()), - ]), - ), - ]) + pa_schema = pa.schema( + [ + pa.field("foo", pa.large_string()), + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field( + "address", + pa.struct( + [ + pa.field("street", pa.string()), + pa.field("city", pa.string()), + pa.field("zip", pa.int32()), + pa.field("bar", pa.large_string()), + ] + ), + ), + ] + ) arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema) @@ -409,20 +419,24 @@ def test_python_writes_with_small_and_large_types_spark_reads( pyiceberg_df = tbl.scan().to_pandas() assert spark_df.equals(pyiceberg_df) arrow_table_on_read = tbl.scan().to_arrow() - assert arrow_table_on_read.schema == pa.schema([ - pa.field("foo", pa.large_string()), - pa.field("id", pa.int32()), - pa.field("name", pa.large_string()), - pa.field( - "address", - pa.struct([ - pa.field("street", pa.large_string()), - pa.field("city", pa.large_string()), - pa.field("zip", pa.int32()), - pa.field("bar", pa.large_string()), - ]), - ), - ]) + assert arrow_table_on_read.schema == pa.schema( + [ + pa.field("foo", pa.large_string()), + pa.field("id", pa.int32()), + pa.field("name", pa.large_string()), + pa.field( + "address", + pa.struct( + [ + pa.field("street", pa.large_string()), + pa.field("city", pa.large_string()), + pa.field("zip", pa.int32()), + pa.field("bar", pa.large_string()), + ] + ), + ), + ] + ) @pytest.mark.integration @@ -718,10 +732,12 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None "foo": ["a", None, "z"], "bar": [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with tbl.transaction() as txn: @@ -761,10 +777,12 @@ def test_create_table_transaction(catalog: Catalog, format_version: int) -> None "foo": ["a", None, "z"], "bar": [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with catalog.create_table_transaction( @@ -810,9 +828,9 @@ def test_create_table_with_non_default_values(catalog: Catalog, table_schema_wit except NoSuchTableError: pass - iceberg_spec = PartitionSpec(*[ - PartitionField(source_id=2, 
field_id=1001, transform=IdentityTransform(), name="integer_partition") - ]) + iceberg_spec = PartitionSpec( + *[PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="integer_partition")] + ) sort_order = SortOrder(*[SortField(source_id=2, transform=IdentityTransform(), direction=SortDirection.ASC)]) @@ -1071,9 +1089,11 @@ def test_table_write_schema_with_valid_nullability_diff( table_schema = Schema( NestedField(field_id=1, name="long", field_type=LongType(), required=False), ) - other_schema = pa.schema(( - pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field - )) + other_schema = pa.schema( + ( + pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field + ) + ) arrow_table = pa.Table.from_pydict( { "long": [1, 9], @@ -1114,13 +1134,15 @@ def test_table_write_schema_with_valid_upcast( # table's long field should cast to long on read written_arrow_table = tbl.scan().to_arrow() assert written_arrow_table == pyarrow_table_with_promoted_types.cast( - pa.schema(( - pa.field("long", pa.int64(), nullable=True), - pa.field("list", pa.large_list(pa.int64()), nullable=False), - pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), - pa.field("double", pa.float64(), nullable=True), # can support upcasting float to double - pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 - )) + pa.schema( + ( + pa.field("long", pa.int64(), nullable=True), + pa.field("list", pa.large_list(pa.int64()), nullable=False), + pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), + pa.field("double", pa.float64(), nullable=True), # can support upcasting float to double + pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 + ) + ) ) lhs = spark.table(f"{identifier}").toPandas() rhs = written_arrow_table.to_pandas() @@ -1510,16 +1532,20 @@ def test_rewrite_manifest_after_partition_evolution(session_catalog: Catalog) -> def test_writing_null_structs(session_catalog: Catalog) -> None: import pyarrow as pa - schema = pa.schema([ - pa.field( - "struct_field_1", - pa.struct([ - pa.field("string_nested_1", pa.string()), - pa.field("int_item_2", pa.int32()), - pa.field("float_item_2", pa.float32()), - ]), - ), - ]) + schema = pa.schema( + [ + pa.field( + "struct_field_1", + pa.struct( + [ + pa.field("string_nested_1", pa.string()), + pa.field("int_item_2", pa.int32()), + pa.field("float_item_2", pa.float32()), + ] + ), + ), + ] + ) records = [ { diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index e4017e1df5..8bb97e150a 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -547,11 +547,13 @@ def test_binary_type_to_pyarrow() -> None: def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None: - expected = pa.struct([ - pa.field("foo", pa.large_string(), nullable=True, metadata={"field_id": "1"}), - pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}), - pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}), - ]) + expected = pa.struct( + [ + pa.field("foo", pa.large_string(), nullable=True, metadata={"field_id": "1"}), + pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}), + pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}), + ] + ) assert visit(table_schema_simple.as_struct(), _ConvertToArrowSchema()) == 
expected @@ -1771,11 +1773,13 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: def test_schema_mismatch_type(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.decimal128(18, 6), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.decimal128(18, 6), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = r"""Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1792,11 +1796,13 @@ def test_schema_mismatch_type(table_schema_simple: Schema) -> None: def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1813,11 +1819,13 @@ def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: def test_schema_compatible_nullability_diff(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=False), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=False), + ) + ) try: _check_pyarrow_schema_compatible(table_schema_simple, other_schema) @@ -1826,10 +1834,12 @@ def test_schema_compatible_nullability_diff(table_schema_simple: Schema) -> None def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1851,9 +1861,11 @@ def test_schema_compatible_missing_nullable_field_nested(table_schema_nested: Sc 6, pa.field( "person", - pa.struct([ - pa.field("age", pa.int32(), nullable=False), - ]), + pa.struct( + [ + pa.field("age", pa.int32(), nullable=False), + ] + ), nullable=True, ), ) @@ -1869,9 +1881,11 @@ def test_schema_mismatch_missing_required_field_nested(table_schema_nested: Sche 6, pa.field( "person", - pa.struct([ - pa.field("name", pa.string(), nullable=True), - ]), + pa.struct( + [ + pa.field("name", pa.string(), nullable=True), + ] + ), nullable=True, ), ) @@ -1920,12 +1934,14 @@ def test_schema_compatible_nested(table_schema_nested: Schema) -> None: def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("new_field", pa.date32(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), 
nullable=True), + pa.field("new_field", pa.date32(), nullable=True), + ) + ) with pytest.raises( ValueError, match=r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)." @@ -1942,10 +1958,12 @@ def test_schema_compatible(table_schema_simple: Schema) -> None: def test_schema_projection(table_schema_simple: Schema) -> None: # remove optional `baz` field from `table_schema_simple` - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + ) + ) try: _check_pyarrow_schema_compatible(table_schema_simple, other_schema) except Exception: @@ -1954,11 +1972,13 @@ def test_schema_projection(table_schema_simple: Schema) -> None: def test_schema_downcast(table_schema_simple: Schema) -> None: # large_string type is compatible with string type - other_schema = pa.schema(( - pa.field("foo", pa.large_string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) try: _check_pyarrow_schema_compatible(table_schema_simple, other_schema) @@ -2037,11 +2057,13 @@ def test_identity_partition_on_multi_columns() -> None: assert {table_partition.partition_key.partition for table_partition in result} == expected concatenated_arrow_table = pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]) assert concatenated_arrow_table.num_rows == arrow_table.num_rows - assert concatenated_arrow_table.sort_by([ - ("born_year", "ascending"), - ("n_legs", "ascending"), - ("animal", "ascending"), - ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) + assert concatenated_arrow_table.sort_by( + [ + ("born_year", "ascending"), + ("n_legs", "ascending"), + ("animal", "ascending"), + ] + ) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) def test__to_requested_schema_timestamps( diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 9e6df720c6..027fccae7c 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -239,11 +239,13 @@ def test_pyarrow_variable_binary_to_iceberg() -> None: def test_pyarrow_struct_to_iceberg() -> None: - pyarrow_struct = pa.struct([ - pa.field("foo", pa.string(), nullable=True, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field("bar", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "2"}), - pa.field("baz", pa.bool_(), nullable=True, metadata={"PARQUET:field_id": "3"}), - ]) + pyarrow_struct = pa.struct( + [ + pa.field("foo", pa.string(), nullable=True, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("bar", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "2"}), + pa.field("baz", pa.bool_(), nullable=True, metadata={"PARQUET:field_id": "3"}), + ] + ) expected = StructType( NestedField(field_id=1, name="foo", field_type=StringType(), required=False, doc="foo doc"), NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), @@ -344,84 +346,94 @@ def test_round_schema_large_string() -> None: def test_simple_schema_has_missing_ids() -> None: - schema = 
pa.schema([ - pa.field("foo", pa.string(), nullable=False), - ]) + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False), + ] + ) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) assert not has_ids def test_simple_schema_has_missing_ids_partial() -> None: - schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field("bar", pa.int32(), nullable=False), - ]) + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("bar", pa.int32(), nullable=False), + ] + ) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) assert not has_ids def test_nested_schema_has_missing_ids() -> None: - schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False), - pa.field( - "quux", - pa.map_( - pa.string(), - pa.map_(pa.string(), pa.int32()), + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False), + pa.field( + "quux", + pa.map_( + pa.string(), + pa.map_(pa.string(), pa.int32()), + ), + nullable=False, ), - nullable=False, - ), - ]) + ] + ) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) assert not has_ids def test_nested_schema_has_ids() -> None: - schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field( - "quux", - pa.map_( - pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), - pa.field( - "value", - pa.map_( - pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), - pa.field("value", pa.int32(), metadata={"PARQUET:field_id": "10"}), + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field( + "quux", + pa.map_( + pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), + pa.field( + "value", + pa.map_( + pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), + pa.field("value", pa.int32(), metadata={"PARQUET:field_id": "10"}), + ), + nullable=False, + metadata={"PARQUET:field_id": "8"}, ), - nullable=False, - metadata={"PARQUET:field_id": "8"}, ), + nullable=False, + metadata={"PARQUET:field_id": "6", "doc": "quux doc"}, ), - nullable=False, - metadata={"PARQUET:field_id": "6", "doc": "quux doc"}, - ), - ]) + ] + ) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) assert has_ids def test_nested_schema_has_partial_missing_ids() -> None: - schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field( - "quux", - pa.map_( - pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), - pa.field( - "value", - pa.map_(pa.field("key", pa.string(), nullable=False), pa.field("value", pa.int32())), - nullable=False, + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field( + "quux", + pa.map_( + pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), + pa.field( + "value", + pa.map_(pa.field("key", pa.string(), nullable=False), pa.field("value", pa.int32())), + nullable=False, + ), ), + nullable=False, + metadata={"PARQUET:field_id": "6", "doc": "quux doc"}, ), - nullable=False, - metadata={"PARQUET:field_id": "6", "doc": "quux doc"}, - ), - ]) + ] + ) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) assert 
not has_ids @@ -441,11 +453,13 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping( pyarrow_schema_simple_without_ids: pa.Schema, iceberg_schema_simple: Schema ) -> None: schema = pyarrow_schema_simple_without_ids - name_mapping = NameMapping([ - MappedField(field_id=1, names=["foo"]), - MappedField(field_id=2, names=["bar"]), - MappedField(field_id=3, names=["baz"]), - ]) + name_mapping = NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + ] + ) assert pyarrow_to_schema(schema, name_mapping) == iceberg_schema_simple @@ -454,9 +468,11 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping_partial_ pyarrow_schema_simple_without_ids: pa.Schema, ) -> None: schema = pyarrow_schema_simple_without_ids - name_mapping = NameMapping([ - MappedField(field_id=1, names=["foo"]), - ]) + name_mapping = NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + ] + ) with pytest.raises(ValueError) as exc_info: _ = pyarrow_to_schema(schema, name_mapping) assert "Could not find field with name: bar" in str(exc_info.value) @@ -467,83 +483,89 @@ def test_nested_pyarrow_schema_to_schema_missing_ids_using_name_mapping( ) -> None: schema = pyarrow_schema_nested_without_ids - name_mapping = NameMapping([ - MappedField(field_id=1, names=["foo"]), - MappedField(field_id=2, names=["bar"]), - MappedField(field_id=3, names=["baz"]), - MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), - MappedField( - field_id=6, - names=["quux"], - fields=[ - MappedField(field_id=7, names=["key"]), - MappedField( - field_id=8, - names=["value"], - fields=[ - MappedField(field_id=9, names=["key"]), - MappedField(field_id=10, names=["value"]), - ], - ), - ], - ), - MappedField( - field_id=11, - names=["location"], - fields=[ - MappedField( - field_id=12, - names=["element"], - fields=[ - MappedField(field_id=13, names=["latitude"]), - MappedField(field_id=14, names=["longitude"]), - ], - ) - ], - ), - MappedField( - field_id=15, - names=["person"], - fields=[ - MappedField(field_id=16, names=["name"]), - MappedField(field_id=17, names=["age"]), - ], - ), - ]) + name_mapping = NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), + MappedField( + field_id=6, + names=["quux"], + fields=[ + MappedField(field_id=7, names=["key"]), + MappedField( + field_id=8, + names=["value"], + fields=[ + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), + ], + ), + ], + ), + MappedField( + field_id=11, + names=["location"], + fields=[ + MappedField( + field_id=12, + names=["element"], + fields=[ + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), + ], + ) + ], + ), + MappedField( + field_id=15, + names=["person"], + fields=[ + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), + ], + ), + ] + ) assert pyarrow_to_schema(schema, name_mapping) == iceberg_schema_nested def test_pyarrow_schema_to_schema_missing_ids_using_name_mapping_nested_missing_id() -> None: - schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False), - pa.field( - "quux", - pa.map_( - pa.string(), - pa.map_(pa.string(), pa.int32()), - ), - nullable=False, - ), - ]) - - name_mapping = NameMapping([ - 
MappedField(field_id=1, names=["foo"]), - MappedField( - field_id=6, - names=["quux"], - fields=[ - MappedField(field_id=7, names=["key"]), - MappedField( - field_id=8, - names=["value"], - fields=[ - MappedField(field_id=10, names=["value"]), - ], + schema = pa.schema( + [ + pa.field("foo", pa.string(), nullable=False), + pa.field( + "quux", + pa.map_( + pa.string(), + pa.map_(pa.string(), pa.int32()), ), - ], - ), - ]) + nullable=False, + ), + ] + ) + + name_mapping = NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + MappedField( + field_id=6, + names=["quux"], + fields=[ + MappedField(field_id=7, names=["key"]), + MappedField( + field_id=8, + names=["value"], + fields=[ + MappedField(field_id=10, names=["value"]), + ], + ), + ], + ), + ] + ) with pytest.raises(ValueError) as exc_info: _ = pyarrow_to_schema(schema, name_mapping) assert "Could not find field with name: quux.value.key" in str(exc_info.value) @@ -562,38 +584,44 @@ def test_pyarrow_schema_to_schema_fresh_ids_nested_schema( def test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa.Schema) -> None: - expected_schema = pa.schema([ - pa.field("foo", pa.large_string(), nullable=False), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("qux", pa.large_list(pa.large_string()), nullable=False), - pa.field( - "quux", - pa.map_( - pa.large_string(), - pa.map_(pa.large_string(), pa.int32()), + expected_schema = pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=False), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("qux", pa.large_list(pa.large_string()), nullable=False), + pa.field( + "quux", + pa.map_( + pa.large_string(), + pa.map_(pa.large_string(), pa.int32()), + ), + nullable=False, ), - nullable=False, - ), - pa.field( - "location", - pa.large_list( - pa.struct([ - pa.field("latitude", pa.float32(), nullable=False), - pa.field("longitude", pa.float32(), nullable=False), - ]), + pa.field( + "location", + pa.large_list( + pa.struct( + [ + pa.field("latitude", pa.float32(), nullable=False), + pa.field("longitude", pa.float32(), nullable=False), + ] + ), + ), + nullable=False, ), - nullable=False, - ), - pa.field( - "person", - pa.struct([ - pa.field("name", pa.large_string(), nullable=True), - pa.field("age", pa.int32(), nullable=False), - ]), - nullable=True, - ), - ]) + pa.field( + "person", + pa.struct( + [ + pa.field("name", pa.large_string(), nullable=True), + pa.field("age", pa.int32(), nullable=False), + ] + ), + nullable=True, + ), + ] + ) assert _pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids) == expected_schema diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 397fa9f537..bcb2d643dc 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -538,15 +538,15 @@ def test_update_column(table_v1: Table, table_v2: Table) -> None: assert new_schema3.find_field("z").required is False, "failed to update existing field required" # assert the above two updates also works with union_by_name - assert table.update_schema().union_by_name(new_schema)._apply() == new_schema, ( - "failed to update existing field doc with union_by_name" - ) - assert table.update_schema().union_by_name(new_schema2)._apply() == new_schema2, ( - "failed to remove existing field doc with union_by_name" - ) - assert table.update_schema().union_by_name(new_schema3)._apply() == new_schema3, ( - "failed to update existing field required with union_by_name" - 
) + assert ( + table.update_schema().union_by_name(new_schema)._apply() == new_schema + ), "failed to update existing field doc with union_by_name" + assert ( + table.update_schema().union_by_name(new_schema2)._apply() == new_schema2 + ), "failed to remove existing field doc with union_by_name" + assert ( + table.update_schema().union_by_name(new_schema3)._apply() == new_schema3 + ), "failed to update existing field required with union_by_name" def test_add_primitive_type_column(table_v2: Table) -> None: @@ -1077,52 +1077,56 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: def test_correct_schema() -> None: - table_metadata = TableMetadataV2(**{ - "format-version": 2, - "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", - "location": "s3://bucket/test/location", - "last-sequence-number": 34, - "last-updated-ms": 1602638573590, - "last-column-id": 3, - "current-schema-id": 1, - "schemas": [ - {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, - { - "type": "struct", - "schema-id": 1, - "identifier-field-ids": [1, 2], - "fields": [ - {"id": 1, "name": "x", "required": True, "type": "long"}, - {"id": 2, "name": "y", "required": True, "type": "long"}, - {"id": 3, "name": "z", "required": True, "type": "long"}, - ], - }, - ], - "default-spec-id": 0, - "partition-specs": [{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}], - "last-partition-id": 1000, - "default-sort-order-id": 0, - "sort-orders": [], - "current-snapshot-id": 123, - "snapshots": [ - { - "snapshot-id": 234, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 10, - }, - { - "snapshot-id": 123, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 0, - }, - ], - }) + table_metadata = TableMetadataV2( + **{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + ], + "default-spec-id": 0, + "partition-specs": [ + {"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]} + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [], + "current-snapshot-id": 123, + "snapshots": [ + { + "snapshot-id": 234, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 10, + }, + { + "snapshot-id": 123, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 0, + }, + ], + } + ) t = Table( identifier=("default", "t1"), diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index bd271f59f8..c567f3ffb4 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ 
-30,49 +30,51 @@ @pytest.fixture(scope="session") def table_name_mapping_nested() -> NameMapping: - return NameMapping([ - MappedField(field_id=1, names=["foo"]), - MappedField(field_id=2, names=["bar"]), - MappedField(field_id=3, names=["baz"]), - MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), - MappedField( - field_id=6, - names=["quux"], - fields=[ - MappedField(field_id=7, names=["key"]), - MappedField( - field_id=8, - names=["value"], - fields=[ - MappedField(field_id=9, names=["key"]), - MappedField(field_id=10, names=["value"]), - ], - ), - ], - ), - MappedField( - field_id=11, - names=["location"], - fields=[ - MappedField( - field_id=12, - names=["element"], - fields=[ - MappedField(field_id=13, names=["latitude"]), - MappedField(field_id=14, names=["longitude"]), - ], - ) - ], - ), - MappedField( - field_id=15, - names=["person"], - fields=[ - MappedField(field_id=16, names=["name"]), - MappedField(field_id=17, names=["age"]), - ], - ), - ]) + return NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), + MappedField( + field_id=6, + names=["quux"], + fields=[ + MappedField(field_id=7, names=["key"]), + MappedField( + field_id=8, + names=["value"], + fields=[ + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), + ], + ), + ], + ), + MappedField( + field_id=11, + names=["location"], + fields=[ + MappedField( + field_id=12, + names=["element"], + fields=[ + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), + ], + ) + ], + ), + MappedField( + field_id=15, + names=["person"], + fields=[ + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), + ], + ), + ] + ) def test_json_mapped_field_deserialization() -> None: @@ -165,26 +167,30 @@ def test_json_name_mapping_deserialization() -> None: ] """ - assert parse_mapping_from_json(name_mapping) == NameMapping([ - MappedField(field_id=1, names=["id", "record_id"]), - MappedField(field_id=2, names=["data"]), - MappedField( - names=["location"], - field_id=3, - fields=[ - MappedField(field_id=4, names=["latitude", "lat"]), - MappedField(field_id=5, names=["longitude", "long"]), - ], - ), - ]) + assert parse_mapping_from_json(name_mapping) == NameMapping( + [ + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), + MappedField( + names=["location"], + field_id=3, + fields=[ + MappedField(field_id=4, names=["latitude", "lat"]), + MappedField(field_id=5, names=["longitude", "long"]), + ], + ), + ] + ) def test_json_mapped_field_no_field_id_serialization() -> None: - table_name_mapping_nested_no_field_id = NameMapping([ - MappedField(field_id=1, names=["foo"]), - MappedField(field_id=None, names=["bar"]), - MappedField(field_id=2, names=["qux"], fields=[MappedField(field_id=None, names=["element"])]), - ]) + table_name_mapping_nested_no_field_id = NameMapping( + [ + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=None, names=["bar"]), + MappedField(field_id=2, names=["qux"], fields=[MappedField(field_id=None, names=["element"])]), + ] + ) assert ( table_name_mapping_nested_no_field_id.model_dump_json() @@ -200,18 +206,20 @@ def test_json_serialization(table_name_mapping_nested: NameMapping) -> None: def test_name_mapping_to_string() -> None: - nm = 
NameMapping([ - MappedField(field_id=1, names=["id", "record_id"]), - MappedField(field_id=2, names=["data"]), - MappedField( - names=["location"], - field_id=3, - fields=[ - MappedField(field_id=4, names=["lat", "latitude"]), - MappedField(field_id=5, names=["long", "longitude"]), - ], - ), - ]) + nm = NameMapping( + [ + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), + MappedField( + names=["location"], + field_id=3, + fields=[ + MappedField(field_id=4, names=["lat", "latitude"]), + MappedField(field_id=5, names=["long", "longitude"]), + ], + ), + ] + ) assert ( str(nm) @@ -294,51 +302,53 @@ def test_update_mapping(table_name_mapping_nested: NameMapping) -> None: 15: [NestedField(19, "name", StringType(), True), NestedField(20, "add_20", StringType(), True)], } - expected = NameMapping([ - MappedField(field_id=1, names=["foo", "foo_update"]), - MappedField(field_id=2, names=["bar"]), - MappedField(field_id=3, names=["baz"]), - MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), - MappedField( - field_id=6, - names=["quux"], - fields=[ - MappedField(field_id=7, names=["key"]), - MappedField( - field_id=8, - names=["value"], - fields=[ - MappedField(field_id=9, names=["key"]), - MappedField(field_id=10, names=["value"]), - ], - ), - ], - ), - MappedField( - field_id=11, - names=["location"], - fields=[ - MappedField( - field_id=12, - names=["element"], - fields=[ - MappedField(field_id=13, names=["latitude"]), - MappedField(field_id=14, names=["longitude"]), - ], - ) - ], - ), - MappedField( - field_id=15, - names=["person"], - fields=[ - MappedField(field_id=17, names=["age"]), - MappedField(field_id=19, names=["name"]), - MappedField(field_id=20, names=["add_20"]), - ], - ), - MappedField(field_id=18, names=["add_18"]), - ]) + expected = NameMapping( + [ + MappedField(field_id=1, names=["foo", "foo_update"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), + MappedField( + field_id=6, + names=["quux"], + fields=[ + MappedField(field_id=7, names=["key"]), + MappedField( + field_id=8, + names=["value"], + fields=[ + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), + ], + ), + ], + ), + MappedField( + field_id=11, + names=["location"], + fields=[ + MappedField( + field_id=12, + names=["element"], + fields=[ + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), + ], + ) + ], + ), + MappedField( + field_id=15, + names=["person"], + fields=[ + MappedField(field_id=17, names=["age"]), + MappedField(field_id=19, names=["name"]), + MappedField(field_id=20, names=["add_20"]), + ], + ), + MappedField(field_id=18, names=["add_18"]), + ] + ) assert update_mapping(table_name_mapping_nested, updates, adds) == expected diff --git a/tests/test_schema.py b/tests/test_schema.py index d1fc19df77..daa46dee1f 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1618,11 +1618,13 @@ def test_append_nested_lists() -> None: def test_union_with_pa_schema(primitive_fields: NestedField) -> None: base_schema = Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True)) - pa_schema = pa.schema([ - pa.field("foo", pa.string(), nullable=False), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - ]) + pa_schema = pa.schema( + [ + pa.field("foo", 
pa.string(), nullable=False), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ] + ) new_schema = UpdateSchema(transaction=None, schema=base_schema).union_by_name(pa_schema)._apply() # type: ignore @@ -1642,10 +1644,12 @@ def test_arrow_schema() -> None: NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), ) - expected_schema = pa.schema([ - pa.field("foo", pa.large_string(), nullable=False), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - ]) + expected_schema = pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=False), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ] + ) assert base_schema.as_arrow() == expected_schema diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 154671c92e..3b1fc6f013 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -621,9 +621,9 @@ def test_write_manifest_list( def test_file_format_case_insensitive(raw_file_format: str, expected_file_format: FileFormat) -> None: if expected_file_format: parsed_file_format = FileFormat(raw_file_format) - assert parsed_file_format == expected_file_format, ( - f"File format {raw_file_format}: {parsed_file_format} != {expected_file_format}" - ) + assert ( + parsed_file_format == expected_file_format + ), f"File format {raw_file_format}: {parsed_file_format} != {expected_file_format}" else: with pytest.raises(ValueError): _ = FileFormat(raw_file_format) From e5bfa1e49eda103c0808cff1e7c6a489f84982ea Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 6 Jan 2025 01:53:12 -0500 Subject: [PATCH 12/32] Move `mkdocs` to use poetry as `docs` (#1486) * poetry add $(cat mkdocs/requirements.txt | grep -v #) --group dev * add `make docs` * update instructions * strict mode * make docs-build * docs-serve * add comment * add docs as dep group * add make install-poetry --- .github/workflows/python-ci-docs.yml | 10 +- .github/workflows/python-release-docs.yml | 12 +- Makefile | 11 +- mkdocs/README.md | 5 +- mkdocs/requirements.txt | 28 -- poetry.lock | 368 +++++++++++++++++++++- pyproject.toml | 319 +++++++++++++++++++ 7 files changed, 707 insertions(+), 46 deletions(-) delete mode 100644 mkdocs/requirements.txt diff --git a/.github/workflows/python-ci-docs.yml b/.github/workflows/python-ci-docs.yml index 19c4bb6ac1..d6e14c8400 100644 --- a/.github/workflows/python-ci-docs.yml +++ b/.github/workflows/python-ci-docs.yml @@ -36,12 +36,12 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install poetry + run: make install-poetry - uses: actions/setup-python@v5 with: python-version: 3.12 - name: Install - working-directory: ./mkdocs - run: pip install -r requirements.txt - - name: Build - working-directory: ./mkdocs - run: mkdocs build --strict + run: make docs-install + - name: Build docs + run: make docs-build diff --git a/.github/workflows/python-release-docs.yml b/.github/workflows/python-release-docs.yml index 2f1b1155e9..2823563fe5 100644 --- a/.github/workflows/python-release-docs.yml +++ b/.github/workflows/python-release-docs.yml @@ -31,15 +31,15 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install poetry + run: make install-poetry - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - - name: Install - working-directory: ./mkdocs - run: pip install -r requirements.txt - - name: Build - working-directory: ./mkdocs - run: mkdocs build --strict + - name: Install docs + run: make 
docs-install + - name: Build docs + run: make docs-build - name: Copy working-directory: ./mkdocs run: mv ./site /tmp/site diff --git a/Makefile b/Makefile index f2bb6f6871..b53a98da61 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ install-poetry: ## Install poetry if the user has not done that yet. echo "Poetry is already installed."; \ fi -install-dependencies: ## Install dependencies including dev and all extras +install-dependencies: ## Install dependencies including dev, docs, and all extras poetry install --all-extras install: | install-poetry install-dependencies @@ -97,3 +97,12 @@ clean: ## Clean up the project Python working environment @find . -name "*.pyd" -exec echo Deleting {} \; -delete @find . -name "*.pyo" -exec echo Deleting {} \; -delete @echo "Cleanup complete" + +docs-install: + poetry install --with docs + +docs-serve: + poetry run mkdocs serve -f mkdocs/mkdocs.yml + +docs-build: + poetry run mkdocs build -f mkdocs/mkdocs.yml --strict diff --git a/mkdocs/README.md b/mkdocs/README.md index e9e0462bee..271025a726 100644 --- a/mkdocs/README.md +++ b/mkdocs/README.md @@ -22,7 +22,6 @@ The pyiceberg docs are stored in `docs/`. ## Running docs locally ```sh -pip3 install -r requirements.txt -mkdocs serve -open http://localhost:8000/ +make docs-install +make docs-serve ``` diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt deleted file mode 100644 index f374b85bea..0000000000 --- a/mkdocs/requirements.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -mkdocs==1.6.1 -griffe==1.5.4 -jinja2==3.1.5 -mkdocstrings==0.27.0 -mkdocstrings-python==1.13.0 -mkdocs-literate-nav==0.6.1 -mkdocs-autorefs==1.2.0 -mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.49 -mkdocs-material-extensions==1.3.1 -mkdocs-section-index==0.3.9 diff --git a/poetry.lock b/poetry.lock index 4fd524bb3f..b1b73746c1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -345,6 +345,20 @@ typing-extensions = ">=4.6.0" [package.extras] aio = ["azure-core[aio] (>=1.30.0)"] +[[package]] +name = "babel" +version = "2.16.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, + {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + [[package]] name = "blinker" version = "1.9.0" @@ -1461,6 +1475,23 @@ ray = ["packaging", "ray[client,data] (>=2.0.0)", "ray[client,data] (>=2.10.0)"] sql = ["connectorx", "sqlalchemy", "sqlglot"] unity = ["unitycatalog"] +[[package]] +name = "ghp-import" +version = "2.1.0" +description = "Copy your docs directly to the gh-pages branch." +optional = false +python-versions = "*" +files = [ + {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, + {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.1" + +[package.extras] +dev = ["flake8", "markdown", "twine", "wheel"] + [[package]] name = "google-api-core" version = "2.24.0" @@ -1745,6 +1776,20 @@ files = [ docs = ["Sphinx", "furo"] test = ["objgraph", "psutil"] +[[package]] +name = "griffe" +version = "1.5.4" +description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.9" +files = [ + {file = "griffe-1.5.4-py3-none-any.whl", hash = "sha256:ed33af890586a5bebc842fcb919fc694b3dc1bc55b7d9e0228de41ce566b4a1d"}, + {file = "griffe-1.5.4.tar.gz", hash = "sha256:073e78ad3e10c8378c2f798bd4ef87b92d8411e9916e157fd366a17cc4fd4e52"}, +] + +[package.dependencies] +colorama = ">=0.4" + [[package]] name = "identify" version = "2.6.3" @@ -1896,8 +1941,6 @@ optional = false python-versions = "*" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, - {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, - {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -2012,6 +2055,24 @@ files = [ {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, ] +[[package]] +name = "markdown" +version = "3.7" +description = "Python implementation of John Gruber's Markdown." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"}, + {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] +testing = ["coverage", "pyyaml"] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -2117,6 +2178,207 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +description = "A deep merge function for 🐍." +optional = false +python-versions = ">=3.6" +files = [ + {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, + {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, +] + +[[package]] +name = "mkdocs" +version = "1.6.1" +description = "Project documentation with Markdown." +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"}, + {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} +ghp-import = ">=1.0" +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} +jinja2 = ">=2.11.1" +markdown = ">=3.3.6" +markupsafe = ">=2.0.1" +mergedeep = ">=1.3.4" +mkdocs-get-deps = ">=0.2.0" +packaging = ">=20.5" +pathspec = ">=0.11.1" +pyyaml = ">=5.1" +pyyaml-env-tag = ">=0.1" +watchdog = ">=2.0" + +[package.extras] +i18n = ["babel (>=2.9.0)"] +min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] + +[[package]] +name = "mkdocs-autorefs" +version = "1.2.0" +description = "Automatically link across pages in MkDocs." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"}, + {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"}, +] + +[package.dependencies] +Markdown = ">=3.3" +markupsafe = ">=2.0.1" +mkdocs = ">=1.1" + +[[package]] +name = "mkdocs-gen-files" +version = "0.5.0" +description = "MkDocs plugin to programmatically generate documentation pages during the build" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_gen_files-0.5.0-py3-none-any.whl", hash = "sha256:7ac060096f3f40bd19039e7277dd3050be9a453c8ac578645844d4d91d7978ea"}, + {file = "mkdocs_gen_files-0.5.0.tar.gz", hash = "sha256:4c7cf256b5d67062a788f6b1d035e157fc1a9498c2399be9af5257d4ff4d19bc"}, +] + +[package.dependencies] +mkdocs = ">=1.0.3" + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"}, + {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.3", markers = "python_version < \"3.10\""} +mergedeep = ">=1.3.4" +platformdirs = ">=2.2.0" +pyyaml = ">=5.1" + +[[package]] +name = "mkdocs-literate-nav" +version = "0.6.1" +description = "MkDocs plugin to specify the navigation in Markdown instead of YAML" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_literate_nav-0.6.1-py3-none-any.whl", hash = "sha256:e70bdc4a07050d32da79c0b697bd88e9a104cf3294282e9cb20eec94c6b0f401"}, + {file = "mkdocs_literate_nav-0.6.1.tar.gz", hash = "sha256:78a7ab6d878371728acb0cdc6235c9b0ffc6e83c997b037f4a5c6ff7cef7d759"}, +] + +[package.dependencies] +mkdocs = ">=1.0.3" + +[[package]] +name = "mkdocs-material" +version = "9.5.49" +description = "Documentation that simply works" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_material-9.5.49-py3-none-any.whl", hash = "sha256:c3c2d8176b18198435d3a3e119011922f3e11424074645c24019c2dcf08a360e"}, + {file = "mkdocs_material-9.5.49.tar.gz", hash = "sha256:3671bb282b4f53a1c72e08adbe04d2481a98f85fed392530051f80ff94a9621d"}, +] + +[package.dependencies] +babel = ">=2.10,<3.0" +colorama = ">=0.4,<1.0" +jinja2 = ">=3.0,<4.0" +markdown = ">=3.2,<4.0" +mkdocs = ">=1.6,<2.0" +mkdocs-material-extensions = ">=1.3,<2.0" +paginate = ">=0.5,<1.0" +pygments = ">=2.16,<3.0" +pymdown-extensions = ">=10.2,<11.0" +regex = ">=2022.4" +requests = ">=2.26,<3.0" + +[package.extras] +git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] +imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"] +recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +description = "Extension pack for Python Markdown and MkDocs Material." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"}, + {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, +] + +[[package]] +name = "mkdocs-section-index" +version = "0.3.9" +description = "MkDocs plugin to allow clickable sections that lead to an index page" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_section_index-0.3.9-py3-none-any.whl", hash = "sha256:5e5eb288e8d7984d36c11ead5533f376fdf23498f44e903929d72845b24dfe34"}, + {file = "mkdocs_section_index-0.3.9.tar.gz", hash = "sha256:b66128d19108beceb08b226ee1ba0981840d14baf8a652b6c59e650f3f92e4f8"}, +] + +[package.dependencies] +mkdocs = ">=1.2" + +[[package]] +name = "mkdocstrings" +version = "0.27.0" +description = "Automatic documentation from sources, for MkDocs." +optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings-0.27.0-py3-none-any.whl", hash = "sha256:6ceaa7ea830770959b55a16203ac63da24badd71325b96af950e59fd37366332"}, + {file = "mkdocstrings-0.27.0.tar.gz", hash = "sha256:16adca6d6b0a1f9e0c07ff0b02ced8e16f228a9d65a37c063ec4c14d7b76a657"}, +] + +[package.dependencies] +click = ">=7.0" +importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} +Jinja2 = ">=2.11.1" +Markdown = ">=3.6" +MarkupSafe = ">=1.1" +mkdocs = ">=1.4" +mkdocs-autorefs = ">=1.2" +platformdirs = ">=2.2" +pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "1.13.0" +description = "A Python handler for mkdocstrings." +optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings_python-1.13.0-py3-none-any.whl", hash = "sha256:b88bbb207bab4086434743849f8e796788b373bd32e7bfefbf8560ac45d88f97"}, + {file = "mkdocstrings_python-1.13.0.tar.gz", hash = "sha256:2dbd5757e8375b9720e81db16f52f1856bf59905428fd7ef88005d1370e2f64c"}, +] + +[package.dependencies] +griffe = ">=0.49" +mkdocs-autorefs = ">=1.2" +mkdocstrings = ">=0.26" + [[package]] name = "mmh3" version = "5.0.1" @@ -2667,6 +2929,21 @@ files = [ {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] +[[package]] +name = "paginate" +version = "0.5.7" +description = "Divides large result sets into pages for easier browsing" +optional = false +python-versions = "*" +files = [ + {file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"}, + {file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"}, +] + +[package.extras] +dev = ["pytest", "tox"] +lint = ["black"] + [[package]] name = "pandas" version = "2.2.3" @@ -2764,6 +3041,17 @@ files = [ {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, ] +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + [[package]] name = "platformdirs" version = "4.3.6" @@ -3328,6 +3616,24 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pymdown-extensions" +version = "10.13" +description = "Extension pack for Python Markdown." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pymdown_extensions-10.13-py3-none-any.whl", hash = "sha256:80bc33d715eec68e683e04298946d47d78c7739e79d808203df278ee8ef89428"}, + {file = "pymdown_extensions-10.13.tar.gz", hash = "sha256:e0b351494dc0d8d14a1f52b39b1499a00ef1566b4ba23dc74f1eba75c736f5dd"}, +] + +[package.dependencies] +markdown = ">=3.6" +pyyaml = "*" + +[package.extras] +extra = ["pygments (>=2.12)"] + [[package]] name = "pyparsing" version = "3.2.1" @@ -3574,6 +3880,20 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "pyyaml-env-tag" +version = "0.1" +description = "A custom YAML tag for referencing environment variables in YAML files. " +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, + {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, +] + +[package.dependencies] +pyyaml = "*" + [[package]] name = "ray" version = "2.40.0" @@ -4384,6 +4704,48 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] +[[package]] +name = "watchdog" +version = "6.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.9" +files = [ + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d1cdb490583ebd691c012b3d6dae011000fe42edb7a82ece80965b42abd61f26"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc64ab3bdb6a04d69d4023b29422170b74681784ffb9463ed4870cf2f3e66112"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c897ac1b55c5a1461e16dae288d22bb2e412ba9807df8397a635d88f671d36c3"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = 
"sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e6f0e77c9417e7cd62af82529b10563db3423625c5fce018430b249bf977f9e8"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:90c8e78f3b94014f7aaae121e6b909674df5b46ec24d6bebc45c44c56729af2a"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7631a77ffb1f7d2eefa4445ebbee491c720a5661ddf6df3498ebecae5ed375c"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c7ac31a19f4545dd92fc25d200694098f42c9a8e391bc00bdd362c5736dbf881"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9513f27a1a582d9808cf21a07dae516f0fab1cf2d7683a742c498b93eedabb11"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7a0e56874cfbc4b9b05c60c8a1926fedf56324bb08cfbc188969777940aef3aa"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6439e374fc012255b4ec786ae3c4bc838cd7309a540e5fe0952d03687d8804e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2"}, + {file = "watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a"}, + {file = "watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680"}, + {file = "watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f"}, + {file = "watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282"}, +] + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + [[package]] name = "werkzeug" version = "3.1.3" @@ -4734,4 +5096,4 @@ zstandard = ["zstandard"] [metadata] 
lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "2084f03c93f2d1085a5671a171c6cbeb96d9688079270ceca38b0854fe9e0520" +content-hash = "3f9ea520ceb12bb56d371c19ee4c59f14ba258878a65067c37684dfc209f85b9" diff --git a/pyproject.toml b/pyproject.toml index a2737c3f92..66a95a1561 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,6 +95,21 @@ pyspark = "3.5.3" cython = "3.0.11" deptry = ">=0.14,<0.22" docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520 + +[tool.poetry.group.docs.dependencies] +# for mkdocs +mkdocs = "1.6.1" +griffe = "1.5.4" +jinja2 = "3.1.5" +mkdocstrings = "0.27.0" +mkdocstrings-python = "1.13.0" +mkdocs-literate-nav = "0.6.1" +mkdocs-autorefs = "1.2.0" +mkdocs-gen-files = "0.5.0" +mkdocs-material = "9.5.49" +mkdocs-material-extensions = "1.3.1" +mkdocs-section-index = "0.3.9" + [[tool.mypy.overrides]] module = "pytest_mock.*" ignore_missing_imports = true @@ -859,6 +874,310 @@ ignore_missing_imports = true module = "tenacity.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "pyarrow.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pandas.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "snappy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "zstandard.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pydantic.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pydantic_core.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pytest.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "fastavro.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "mmh3.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "hive_metastore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "thrift.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "requests_mock.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "click.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "rich.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "fsspec.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "s3fs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "azure.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "adlfs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "gcsfs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "packaging.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "tests.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "boto3" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "botocore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "mypy_boto3_glue.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "moto" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "aiobotocore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "aiohttp.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "duckdb.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "ray.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "daft.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pyparsing.*" +ignore_missing_imports = true + 
+[[tool.mypy.overrides]] +module = "pyspark.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "strictyaml.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "sortedcontainers.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "sqlalchemy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Cython.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "setuptools.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "tenacity.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pyarrow.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pandas.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "snappy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "zstandard.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pydantic.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pydantic_core.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pytest.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "fastavro.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "mmh3.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "hive_metastore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "thrift.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "requests_mock.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "click.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "rich.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "fsspec.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "s3fs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "azure.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "adlfs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "gcsfs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "packaging.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "tests.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "boto3" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "botocore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "mypy_boto3_glue.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "moto" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "aiobotocore.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "aiohttp.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "duckdb.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "ray.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "daft.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pyparsing.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "pyspark.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "strictyaml.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "sortedcontainers.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "sqlalchemy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Cython.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "setuptools.*" +ignore_missing_imports = true + 
+[[tool.mypy.overrides]] +module = "tenacity.*" +ignore_missing_imports = true + [tool.poetry.scripts] pyiceberg = "pyiceberg.cli.console:run" From 551f524170b12900cfaa3fef1ec8a0f9f437ee4c Mon Sep 17 00:00:00 2001 From: Jiakai Li <50531391+jiakai-li@users.noreply.github.com> Date: Tue, 7 Jan 2025 03:47:43 +1300 Subject: [PATCH 13/32] Fix read from multiple s3 regions (#1453) * Take netloc into account for s3 filesystem when calling `_initialize_fs` * Fix unit test for s3 fileystem * Update ArrowScan to use different FileSystem per file * Add unit test for `PyArrorFileIO.fs_by_scheme` cache behavior * Add error handling * Update tests/io/test_pyarrow.py Co-authored-by: Kevin Liu * Update `s3.region` document and a test case * Add test case for `PyArrowFileIO.new_input` multi region * Shuffle code location for better maintainability * Comment for future integration test * Typo fix * Document wording * Add warning when the bucket region for a file cannot be resolved (for `pyarrow.S3FileSystem`) * Fix code linting * Update mkdocs/docs/configuration.md Co-authored-by: Kevin Liu * Code refactoring * Unit test * Code refactoring * Test cases * Code format * Code tidy-up * Update pyiceberg/io/pyarrow.py Co-authored-by: Kevin Liu --------- Co-authored-by: Kevin Liu --- mkdocs/docs/configuration.md | 30 ++--- pyiceberg/io/pyarrow.py | 212 +++++++++++++++++++++----------- tests/integration/test_reads.py | 29 +++++ tests/io/test_pyarrow.py | 96 ++++++++++++++- 4 files changed, 273 insertions(+), 94 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 621b313613..06eaac1bed 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -102,21 +102,21 @@ For the FileIO there are several configuration options available: -| Key | Example | Description | -|----------------------|----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| s3.endpoint | | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. | -| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | -| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | -| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | -| s3.role-session-name | session | An optional identifier for the assumed role session. | -| s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | -| s3.signer | bearer | Configure the signature version of the FileIO. | -| s3.signer.uri | | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. | -| s3.signer.endpoint | v1/main/s3-sign | Configure the remote signing endpoint. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. (default : v1/aws/s3/sign). | -| s3.region | us-west-2 | Sets the region of the bucket | -| s3.proxy-uri | | Configure the proxy server to be used by the FileIO. 
| -| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | -| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. | +| Key | Example | Description | +|----------------------|----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| s3.endpoint | | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. | +| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | +| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | +| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | +| s3.role-session-name | session | An optional identifier for the assumed role session. | +| s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | +| s3.signer | bearer | Configure the signature version of the FileIO. | +| s3.signer.uri | | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. | +| s3.signer.endpoint | v1/main/s3-sign | Configure the remote signing endpoint. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. (default : v1/aws/s3/sign). | +| s3.region | us-west-2 | Configure the default region used to initialize an `S3FileSystem`. `PyArrowFileIO` attempts to automatically resolve the region for each S3 bucket, falling back to this value if resolution fails. | +| s3.proxy-uri | | Configure the proxy server to be used by the FileIO. | +| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | +| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. 
| diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index dc41a7d6a1..ad7e4f4f85 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -351,77 +351,141 @@ def parse_location(location: str) -> Tuple[str, str, str]: return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: - if scheme in {"s3", "s3a", "s3n", "oss"}: - from pyarrow.fs import S3FileSystem - - client_kwargs: Dict[str, Any] = { - "endpoint_override": self.properties.get(S3_ENDPOINT), - "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), - "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), - "session_token": get_first_property_value(self.properties, S3_SESSION_TOKEN, AWS_SESSION_TOKEN), - "region": get_first_property_value(self.properties, S3_REGION, AWS_REGION), - } - - if proxy_uri := self.properties.get(S3_PROXY_URI): - client_kwargs["proxy_options"] = proxy_uri - - if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): - client_kwargs["connect_timeout"] = float(connect_timeout) - - if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): - client_kwargs["role_arn"] = role_arn - - if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): - client_kwargs["session_name"] = session_name - - if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING): - client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False) - - return S3FileSystem(**client_kwargs) - elif scheme in ("hdfs", "viewfs"): - from pyarrow.fs import HadoopFileSystem - - hdfs_kwargs: Dict[str, Any] = {} - if netloc: - return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") - if host := self.properties.get(HDFS_HOST): - hdfs_kwargs["host"] = host - if port := self.properties.get(HDFS_PORT): - # port should be an integer type - hdfs_kwargs["port"] = int(port) - if user := self.properties.get(HDFS_USER): - hdfs_kwargs["user"] = user - if kerb_ticket := self.properties.get(HDFS_KERB_TICKET): - hdfs_kwargs["kerb_ticket"] = kerb_ticket - - return HadoopFileSystem(**hdfs_kwargs) + """Initialize FileSystem for different scheme.""" + if scheme in {"oss"}: + return self._initialize_oss_fs() + + elif scheme in {"s3", "s3a", "s3n"}: + return self._initialize_s3_fs(netloc) + + elif scheme in {"hdfs", "viewfs"}: + return self._initialize_hdfs_fs(scheme, netloc) + elif scheme in {"gs", "gcs"}: - from pyarrow.fs import GcsFileSystem - - gcs_kwargs: Dict[str, Any] = {} - if access_token := self.properties.get(GCS_TOKEN): - gcs_kwargs["access_token"] = access_token - if expiration := self.properties.get(GCS_TOKEN_EXPIRES_AT_MS): - gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration)) - if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): - gcs_kwargs["default_bucket_location"] = bucket_location - if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT): - if self.properties.get(GCS_ENDPOINT): - deprecation_message( - deprecated_in="0.8.0", - removed_in="0.9.0", - help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - ) - url_parts = urlparse(endpoint) - gcs_kwargs["scheme"] = url_parts.scheme - gcs_kwargs["endpoint_override"] = url_parts.netloc + return self._initialize_gcs_fs() + + elif scheme in {"file"}: + return 
self._initialize_local_fs() - return GcsFileSystem(**gcs_kwargs) - elif scheme == "file": - return PyArrowLocalFileSystem() else: raise ValueError(f"Unrecognized filesystem type in URI: {scheme}") + def _initialize_oss_fs(self) -> FileSystem: + from pyarrow.fs import S3FileSystem + + client_kwargs: Dict[str, Any] = { + "endpoint_override": self.properties.get(S3_ENDPOINT), + "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), + "session_token": get_first_property_value(self.properties, S3_SESSION_TOKEN, AWS_SESSION_TOKEN), + "region": get_first_property_value(self.properties, S3_REGION, AWS_REGION), + } + + if proxy_uri := self.properties.get(S3_PROXY_URI): + client_kwargs["proxy_options"] = proxy_uri + + if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): + client_kwargs["connect_timeout"] = float(connect_timeout) + + if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): + client_kwargs["role_arn"] = role_arn + + if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): + client_kwargs["session_name"] = session_name + + if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING): + client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False) + + return S3FileSystem(**client_kwargs) + + def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem: + from pyarrow.fs import S3FileSystem, resolve_s3_region + + # Resolve region from netloc(bucket), fallback to user-provided region + provided_region = get_first_property_value(self.properties, S3_REGION, AWS_REGION) + + try: + bucket_region = resolve_s3_region(bucket=netloc) + except (OSError, TypeError): + bucket_region = None + logger.warning(f"Unable to resolve region for bucket {netloc}, using default region {provided_region}") + + bucket_region = bucket_region or provided_region + if bucket_region != provided_region: + logger.warning( + f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: " + f"provided region {provided_region}, actual region {bucket_region}" + ) + + client_kwargs: Dict[str, Any] = { + "endpoint_override": self.properties.get(S3_ENDPOINT), + "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), + "session_token": get_first_property_value(self.properties, S3_SESSION_TOKEN, AWS_SESSION_TOKEN), + "region": bucket_region, + } + + if proxy_uri := self.properties.get(S3_PROXY_URI): + client_kwargs["proxy_options"] = proxy_uri + + if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): + client_kwargs["connect_timeout"] = float(connect_timeout) + + if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): + client_kwargs["role_arn"] = role_arn + + if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): + client_kwargs["session_name"] = session_name + + if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING): + client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False) + + return S3FileSystem(**client_kwargs) + + def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem: + from pyarrow.fs 
import HadoopFileSystem + + hdfs_kwargs: Dict[str, Any] = {} + if netloc: + return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") + if host := self.properties.get(HDFS_HOST): + hdfs_kwargs["host"] = host + if port := self.properties.get(HDFS_PORT): + # port should be an integer type + hdfs_kwargs["port"] = int(port) + if user := self.properties.get(HDFS_USER): + hdfs_kwargs["user"] = user + if kerb_ticket := self.properties.get(HDFS_KERB_TICKET): + hdfs_kwargs["kerb_ticket"] = kerb_ticket + + return HadoopFileSystem(**hdfs_kwargs) + + def _initialize_gcs_fs(self) -> FileSystem: + from pyarrow.fs import GcsFileSystem + + gcs_kwargs: Dict[str, Any] = {} + if access_token := self.properties.get(GCS_TOKEN): + gcs_kwargs["access_token"] = access_token + if expiration := self.properties.get(GCS_TOKEN_EXPIRES_AT_MS): + gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration)) + if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): + gcs_kwargs["default_bucket_location"] = bucket_location + if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT): + if self.properties.get(GCS_ENDPOINT): + deprecation_message( + deprecated_in="0.8.0", + removed_in="0.9.0", + help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", + ) + url_parts = urlparse(endpoint) + gcs_kwargs["scheme"] = url_parts.scheme + gcs_kwargs["endpoint_override"] = url_parts.netloc + + return GcsFileSystem(**gcs_kwargs) + + def _initialize_local_fs(self) -> FileSystem: + return PyArrowLocalFileSystem() + def new_input(self, location: str) -> PyArrowFile: """Get a PyArrowFile instance to read bytes from the file at the given location. @@ -1326,13 +1390,14 @@ def _task_to_table( return None -def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: +def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: deletes_per_file: Dict[str, List[ChunkedArray]] = {} unique_deletes = set(itertools.chain.from_iterable([task.delete_files for task in tasks])) if len(unique_deletes) > 0: executor = ExecutorFactory.get_or_create() deletes_per_files: Iterator[Dict[str, ChunkedArray]] = executor.map( - lambda args: _read_deletes(*args), [(fs, delete) for delete in unique_deletes] + lambda args: _read_deletes(*args), + [(_fs_from_file_path(io, delete_file.file_path), delete_file) for delete_file in unique_deletes], ) for delete in deletes_per_files: for file, arr in delete.items(): @@ -1344,7 +1409,7 @@ def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dic return deletes_per_file -def _fs_from_file_path(file_path: str, io: FileIO) -> FileSystem: +def _fs_from_file_path(io: FileIO, file_path: str) -> FileSystem: scheme, netloc, _ = _parse_location(file_path) if isinstance(io, PyArrowFileIO): return io.fs_by_scheme(scheme, netloc) @@ -1366,7 +1431,6 @@ def _fs_from_file_path(file_path: str, io: FileIO) -> FileSystem: class ArrowScan: _table_metadata: TableMetadata _io: FileIO - _fs: FileSystem _projected_schema: Schema _bound_row_filter: BooleanExpression _case_sensitive: bool @@ -1376,7 +1440,6 @@ class ArrowScan: Attributes: _table_metadata: Current table metadata of the Iceberg table _io: PyIceberg FileIO implementation from which to fetch the io properties - _fs: PyArrow FileSystem to use to read the files _projected_schema: Iceberg Schema to project onto the data files _bound_row_filter: Schema bound row expression to filter 
the data with _case_sensitive: Case sensitivity when looking up column names @@ -1394,7 +1457,6 @@ def __init__( ) -> None: self._table_metadata = table_metadata self._io = io - self._fs = _fs_from_file_path(table_metadata.location, io) # TODO: use different FileSystem per file self._projected_schema = projected_schema self._bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive) self._case_sensitive = case_sensitive @@ -1434,7 +1496,7 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table: ResolveError: When a required field cannot be found in the file ValueError: When a field type in the file cannot be projected to the schema type """ - deletes_per_file = _read_all_delete_files(self._fs, tasks) + deletes_per_file = _read_all_delete_files(self._io, tasks) executor = ExecutorFactory.get_or_create() def _table_from_scan_task(task: FileScanTask) -> pa.Table: @@ -1497,7 +1559,7 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.Record ResolveError: When a required field cannot be found in the file ValueError: When a field type in the file cannot be projected to the schema type """ - deletes_per_file = _read_all_delete_files(self._fs, tasks) + deletes_per_file = _read_all_delete_files(self._io, tasks) return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file) def _record_batches_from_scan_tasks_and_deletes( @@ -1508,7 +1570,7 @@ def _record_batches_from_scan_tasks_and_deletes( if self._limit is not None and total_row_count >= self._limit: break batches = _task_to_record_batches( - self._fs, + _fs_from_file_path(self._io, task.file.file_path), task, self._bound_row_filter, self._projected_schema, diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 8d13724087..f2e79bae60 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -19,6 +19,7 @@ import math import time import uuid +from pathlib import PosixPath from urllib.parse import urlparse import pyarrow as pa @@ -921,3 +922,31 @@ def test_table_scan_empty_table(catalog: Catalog) -> None: result_table = tbl.scan().to_arrow() assert len(result_table) == 0 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_read_from_s3_and_local_fs(catalog: Catalog, tmp_path: PosixPath) -> None: + identifier = "default.test_read_from_s3_and_local_fs" + schema = pa.schema([pa.field("colA", pa.string())]) + arrow_table = pa.Table.from_arrays([pa.array(["one"])], schema=schema) + + tmp_dir = tmp_path / "data" + tmp_dir.mkdir() + local_file = tmp_dir / "local_file.parquet" + + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + tbl = catalog.create_table(identifier, schema=schema) + + # Append table to s3 endpoint + tbl.append(arrow_table) + + # Append a local file + pq.write_table(arrow_table, local_file) + tbl.add_files([str(local_file)]) + + result_table = tbl.scan().to_arrow() + assert result_table["colA"].to_pylist() == ["one", "one"] diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 8bb97e150a..8beb750f49 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=protected-access,unused-argument,redefined-outer-name - +import logging import os import tempfile import uuid @@ -27,7 +27,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -from pyarrow.fs import FileType, LocalFileSystem +from pyarrow.fs import FileType, LocalFileSystem, S3FileSystem from pyiceberg.exceptions import ResolveError from pyiceberg.expressions import ( @@ -360,10 +360,12 @@ def test_pyarrow_s3_session_properties() -> None: **UNIFIED_AWS_SESSION_PROPERTIES, } - with patch("pyarrow.fs.S3FileSystem") as mock_s3fs: + with patch("pyarrow.fs.S3FileSystem") as mock_s3fs, patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver: s3_fileio = PyArrowFileIO(properties=session_properties) filename = str(uuid.uuid4()) + # Mock `resolve_s3_region` to prevent from the location used resolving to a different s3 region + mock_s3_region_resolver.side_effect = OSError("S3 bucket is not found") s3_fileio.new_input(location=f"s3://warehouse/{filename}") mock_s3fs.assert_called_with( @@ -381,10 +383,11 @@ def test_pyarrow_unified_session_properties() -> None: **UNIFIED_AWS_SESSION_PROPERTIES, } - with patch("pyarrow.fs.S3FileSystem") as mock_s3fs: + with patch("pyarrow.fs.S3FileSystem") as mock_s3fs, patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver: s3_fileio = PyArrowFileIO(properties=session_properties) filename = str(uuid.uuid4()) + mock_s3_region_resolver.return_value = "client.region" s3_fileio.new_input(location=f"s3://warehouse/{filename}") mock_s3fs.assert_called_with( @@ -2096,3 +2099,88 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception( _to_requested_schema(requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False) assert "Unsupported schema projection from timestamp[ns] to timestamp[us]" in str(exc_info.value) + + +def test_pyarrow_file_io_fs_by_scheme_cache() -> None: + # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` + # Refer to: https://github.com/apache/arrow/issues/43713 + + pyarrow_file_io = PyArrowFileIO() + us_east_1_region = "us-east-1" + ap_southeast_2_region = "ap-southeast-2" + + with patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver: + # Call with new argument resolves region automatically + mock_s3_region_resolver.return_value = us_east_1_region + filesystem_us = pyarrow_file_io.fs_by_scheme("s3", "us-east-1-bucket") + assert filesystem_us.region == us_east_1_region + assert pyarrow_file_io.fs_by_scheme.cache_info().misses == 1 # type: ignore + assert pyarrow_file_io.fs_by_scheme.cache_info().currsize == 1 # type: ignore + + # Call with different argument also resolves region automatically + mock_s3_region_resolver.return_value = ap_southeast_2_region + filesystem_ap_southeast_2 = pyarrow_file_io.fs_by_scheme("s3", "ap-southeast-2-bucket") + assert filesystem_ap_southeast_2.region == ap_southeast_2_region + assert pyarrow_file_io.fs_by_scheme.cache_info().misses == 2 # type: ignore + assert pyarrow_file_io.fs_by_scheme.cache_info().currsize == 2 # type: ignore + + # Call with same argument hits cache + filesystem_us_cached = pyarrow_file_io.fs_by_scheme("s3", "us-east-1-bucket") + assert filesystem_us_cached.region == us_east_1_region + assert pyarrow_file_io.fs_by_scheme.cache_info().hits == 1 # type: ignore + + # Call with same argument hits cache + filesystem_ap_southeast_2_cached = pyarrow_file_io.fs_by_scheme("s3", 
"ap-southeast-2-bucket") + assert filesystem_ap_southeast_2_cached.region == ap_southeast_2_region + assert pyarrow_file_io.fs_by_scheme.cache_info().hits == 2 # type: ignore + + +def test_pyarrow_io_new_input_multi_region(caplog: Any) -> None: + # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` + # Refer to: https://github.com/apache/arrow/issues/43713 + user_provided_region = "ap-southeast-1" + bucket_regions = [ + ("us-east-2-bucket", "us-east-2"), + ("ap-southeast-2-bucket", "ap-southeast-2"), + ] + + def _s3_region_map(bucket: str) -> str: + for bucket_region in bucket_regions: + if bucket_region[0] == bucket: + return bucket_region[1] + raise OSError("Unknown bucket") + + # For a pyarrow io instance with configured default s3 region + pyarrow_file_io = PyArrowFileIO({"s3.region": user_provided_region}) + with patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver: + mock_s3_region_resolver.side_effect = _s3_region_map + + # The region is set to provided region if bucket region cannot be resolved + with caplog.at_level(logging.WARNING): + assert pyarrow_file_io.new_input("s3://non-exist-bucket/path/to/file")._filesystem.region == user_provided_region + assert f"Unable to resolve region for bucket non-exist-bucket, using default region {user_provided_region}" in caplog.text + + for bucket_region in bucket_regions: + # For s3 scheme, region is overwritten by resolved bucket region if different from user provided region + with caplog.at_level(logging.WARNING): + assert pyarrow_file_io.new_input(f"s3://{bucket_region[0]}/path/to/file")._filesystem.region == bucket_region[1] + assert ( + f"PyArrow FileIO overriding S3 bucket region for bucket {bucket_region[0]}: " + f"provided region {user_provided_region}, actual region {bucket_region[1]}" in caplog.text + ) + + # For oss scheme, user provided region is used instead + assert pyarrow_file_io.new_input(f"oss://{bucket_region[0]}/path/to/file")._filesystem.region == user_provided_region + + +def test_pyarrow_io_multi_fs() -> None: + pyarrow_file_io = PyArrowFileIO({"s3.region": "ap-southeast-1"}) + + with patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver: + mock_s3_region_resolver.return_value = None + + # The PyArrowFileIO instance resolves s3 file input to S3FileSystem + assert isinstance(pyarrow_file_io.new_input("s3://bucket/path/to/file")._filesystem, S3FileSystem) + + # Same PyArrowFileIO instance resolves local file input to LocalFileSystem + assert isinstance(pyarrow_file_io.new_input("file:///path/to/file")._filesystem, LocalFileSystem) From e39f91a03d652b84c96acbf8ceac29777514344d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:33:53 -0500 Subject: [PATCH 14/32] Bump moto from 5.0.25 to 5.0.26 (#1490) Bumps [moto](https://github.com/getmoto/moto) from 5.0.25 to 5.0.26. - [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.25...5.0.26) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b1b73746c1..c95252517a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1941,6 +1941,8 @@ optional = false python-versions = "*" files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -2494,13 +2496,13 @@ type = ["mypy (==1.11.2)"] [[package]] name = "moto" -version = "5.0.25" +version = "5.0.26" description = "A library that allows you to easily mock out tests based on AWS infrastructure" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.25-py3-none-any.whl", hash = "sha256:ab790f9d7d08f30667a196af7cacead03e76c10be2d1148ea00a731d47918a1e"}, - {file = "moto-5.0.25.tar.gz", hash = "sha256:deea8b158cec5a65c9635ae1fff4579d735b11ac8a0e5226fbbeb742ce0ce6b2"}, + {file = "moto-5.0.26-py3-none-any.whl", hash = "sha256:803831f427ca6c0452ae4fb898d731cfc19906466a33a88cbc1076abcbfcbba7"}, + {file = "moto-5.0.26.tar.gz", hash = "sha256:6829f58a670a087e7c5b63f8183c6b72d64a1444e420c212250b7326b69a9183"}, ] [package.dependencies] @@ -3313,6 +3315,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, From 3b580111760f0749922ea593dbe0b1d602952438 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 12:23:06 -0500 Subject: [PATCH 15/32] Build: Bump pytest-checkdocs from 2.10.1 to 2.13.0 (#682) Bumps [pytest-checkdocs](https://github.com/jaraco/pytest-checkdocs) from 2.10.1 to 2.13.0. - [Release notes](https://github.com/jaraco/pytest-checkdocs/releases) - [Changelog](https://github.com/jaraco/pytest-checkdocs/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/pytest-checkdocs/compare/v2.10.1...v2.13.0) --- updated-dependencies: - dependency-name: pytest-checkdocs dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 265 +++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 2 +- 2 files changed, 259 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index c95252517a..7bc22bec33 100644 --- a/poetry.lock +++ b/poetry.lock @@ -185,6 +185,17 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" +[[package]] +name = "alabaster" +version = "0.7.16" +description = "A light, configurable Sphinx theme" +optional = false +python-versions = ">=3.9" +files = [ + {file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"}, + {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -359,6 +370,21 @@ files = [ [package.extras] dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] +[[package]] +name = "backports-tarfile" +version = "1.2.0" +description = "Backport of CPython tarfile module" +optional = false +python-versions = ">=3.8" +files = [ + {file = "backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34"}, + {file = "backports_tarfile-1.2.0.tar.gz", hash = "sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"] + [[package]] name = "blinker" version = "1.9.0" @@ -428,6 +454,7 @@ importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10 packaging = ">=19.1" pyproject_hooks = "*" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +virtualenv = {version = ">=20.0.35", optional = true, markers = "extra == \"virtualenv\""} [package.extras] docs = ["furo (>=2023.08.17)", "sphinx (>=7.0,<8.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)", "sphinx-issues (>=3.0.0)"] @@ -1103,6 +1130,25 @@ files = [ {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"}, ] +[[package]] +name = "domdf-python-tools" +version = "3.9.0" +description = "Helpful functions for Python 🐍 🛠️" +optional = false +python-versions = ">=3.6" +files = [ + {file = "domdf_python_tools-3.9.0-py3-none-any.whl", hash = "sha256:4e1ef365cbc24627d6d1e90cf7d46d8ab8df967e1237f4a26885f6986c78872e"}, + {file = "domdf_python_tools-3.9.0.tar.gz", hash = "sha256:1f8a96971178333a55e083e35610d7688cd7620ad2b99790164e1fc1a3614c18"}, +] + +[package.dependencies] +natsort = ">=7.0.1" +typing-extensions = ">=3.7.4.1" + +[package.extras] +all = ["pytz (>=2019.1)"] +dates = ["pytz (>=2019.1)"] + [[package]] name = "duckdb" version = "1.1.3" @@ -1818,6 +1864,17 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", 
hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, +] + [[package]] name = "importlib-metadata" version = "8.5.0" @@ -1874,6 +1931,45 @@ files = [ {file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"}, ] +[[package]] +name = "jaraco-context" +version = "6.0.1" +description = "Useful decorators and context managers" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4"}, + {file = "jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3"}, +] + +[package.dependencies] +"backports.tarfile" = {version = "*", markers = "python_version < \"3.12\""} + +[package.extras] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["portend", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"] + +[[package]] +name = "jaraco-packaging" +version = "10.2.3" +description = "tools to supplement packaging Python releases" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jaraco.packaging-10.2.3-py3-none-any.whl", hash = "sha256:ceb5806d2ac5731ba5b265d196e4cb848afa2a958f01d0bf3a1dfaa3969ed92c"}, + {file = "jaraco_packaging-10.2.3.tar.gz", hash = "sha256:d726cc42faa62b2f70585cbe1176b4b469fe6d75f21b19034b688b4340917933"}, +] + +[package.dependencies] +build = {version = "*", extras = ["virtualenv"]} +domdf-python-tools = "*" +"jaraco.context" = "*" +sphinx = "*" + +[package.extras] +doc = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "types-docutils"] + [[package]] name = "jinja2" version = "3.1.5" @@ -2795,6 +2891,21 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} +[[package]] +name = "natsort" +version = "8.4.0" +description = "Simple yet flexible natural sorting in Python." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, + {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + [[package]] name = "networkx" version = "3.2.1" @@ -3706,23 +3817,22 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no [[package]] name = "pytest-checkdocs" -version = "2.10.1" +version = "2.13.0" description = "check the README when running tests" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-checkdocs-2.10.1.tar.gz", hash = "sha256:393868583f2d0314f8c5828fd94f7d28699543f6a0a925356d7e274e2952297e"}, - {file = "pytest_checkdocs-2.10.1-py3-none-any.whl", hash = "sha256:f069d6408633697023298ebf66c9bb1cb915c3ae5f047457b507229a4784e153"}, + {file = "pytest_checkdocs-2.13.0-py3-none-any.whl", hash = "sha256:5df5bbd7e9753aa51a5f6954a301a4066bd4a04eb7e0c712c5d5d7ede1cbe153"}, + {file = "pytest_checkdocs-2.13.0.tar.gz", hash = "sha256:b0e67169c543986142e15afbc17c772da87fcdb0922c7b1e4f6c60f8769f11f9"}, ] [package.dependencies] -build = "*" docutils = ">=0.15" -importlib-metadata = {version = ">=4", markers = "python_version < \"3.10\""} +"jaraco.packaging" = ">=9.5" [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff", "types-docutils"] +testing = ["pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "types-docutils"] [[package]] name = "pytest-lazy-fixture" @@ -4389,6 +4499,17 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
+optional = false +python-versions = "*" +files = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -4400,6 +4521,136 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "sphinx" +version = "7.4.7" +description = "Python documentation generator" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"}, + {file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"}, +] + +[package.dependencies] +alabaster = ">=0.7.14,<0.8.0" +babel = ">=2.13" +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} +docutils = ">=0.20,<0.22" +imagesize = ">=1.3" +importlib-metadata = {version = ">=6.0", markers = "python_version < \"3.10\""} +Jinja2 = ">=3.1" +packaging = ">=23.0" +Pygments = ">=2.17" +requests = ">=2.30.0" +snowballstemmer = ">=2.2" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"] +test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, + {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, + {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = 
"sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, + {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +optional = false +python-versions = ">=3.5" +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, + {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["defusedxml (>=0.7.1)", "pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, + {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + [[package]] name = "sqlalchemy" version = "2.0.36" @@ -5099,4 +5350,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "3f9ea520ceb12bb56d371c19ee4c59f14ba258878a65067c37684dfc209f85b9" +content-hash = "59e5678cd718f658c5bd099c03051564ee60f991e5f222bf92da13d1dd025a42" diff --git a/pyproject.toml b/pyproject.toml index 66a95a1561..58dac055ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ cachetools = "^5.5.0" [tool.poetry.group.dev.dependencies] pytest = "7.4.4" -pytest-checkdocs = "2.10.1" +pytest-checkdocs = "2.13.0" pytest-lazy-fixture = "0.6.3" pre-commit = "4.0.1" fastavro = "1.10.0" From e6af50eaa09b3a8437e0a65c02a4637105503305 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:55:51 -0500 Subject: [PATCH 16/32] Build: Bump boto3 from 1.35.88 to 1.35.93 (#1495) Bumps [boto3](https://github.com/boto/boto3) from 1.35.88 to 1.35.93. - [Release notes](https://github.com/boto/boto3/releases) - [Commits](https://github.com/boto/boto3/compare/1.35.88...1.35.93) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7bc22bec33..c96050b0df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,24 +25,31 @@ tests = ["arrow", "dask[dataframe]", "docker", "pytest", "pytest-mock"] [[package]] name = "aiobotocore" -version = "2.16.1" +version = "2.17.0" description = "Async client for aws services using botocore and aiohttp" optional = true python-versions = ">=3.8" files = [ - {file = "aiobotocore-2.16.1-py3-none-any.whl", hash = "sha256:e7cf6295471224c82a111deaf31c2c3a4bcd6dbd6973e75c7fc4739fcccd5b0b"}, - {file = "aiobotocore-2.16.1.tar.gz", hash = "sha256:0f94904c6a1d14d5aac0502fcc1d721b95ee60d46d8a0e546f6203de0410d522"}, + {file = "aiobotocore-2.17.0-py3-none-any.whl", hash = "sha256:aedccd5368a64401233ef9f27983d3d3cb6a507a6ca981f5ec1df014c00e260e"}, + {file = "aiobotocore-2.17.0.tar.gz", hash = "sha256:a3041333c565bff9d63b4468bee4944f2d81cff63a45b10e5cc652f3837f9cc2"}, ] [package.dependencies] aiohttp = ">=3.9.2,<4.0.0" aioitertools = ">=0.5.1,<1.0.0" -botocore = ">=1.35.74,<1.35.89" +botocore = ">=1.35.74,<1.35.94" +jmespath = ">=0.7.1,<2.0.0" +multidict = ">=6.0.0,<7.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, +] wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.36.15,<1.36.30)"] -boto3 = ["boto3 (>=1.35.74,<1.35.89)"] +awscli = ["awscli (>=1.36.15,<1.36.35)"] +boto3 = ["boto3 (>=1.35.74,<1.35.94)"] [[package]] name = "aiohappyeyeballs" @@ -398,17 +405,17 @@ files = [ [[package]] name = "boto3" -version = "1.35.88" +version = "1.35.93" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.88-py3-none-any.whl", hash = "sha256:7bc9b27ad87607256470c70a86c8b8c319ddd6ecae89cc191687cbf8ccb7b6a6"}, - {file = "boto3-1.35.88.tar.gz", hash = "sha256:43c6a7a70bb226770a82a601870136e3bb3bf2808f4576ab5b9d7d140dbf1323"}, + {file = "boto3-1.35.93-py3-none-any.whl", hash = "sha256:7de2c44c960e486f3c57e5203ea6393c6c4f0914c5f81c789ceb8b5d2ba5d1c5"}, + {file = "boto3-1.35.93.tar.gz", hash = "sha256:2446e819cf4e295833474cdcf2c92bc82718ce537e9ee1f17f7e3d237f60e69b"}, ] [package.dependencies] -botocore = ">=1.35.88,<1.36.0" +botocore = ">=1.35.93,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -417,13 +424,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.88" +version = "1.35.93" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.88-py3-none-any.whl", hash = "sha256:e60cc3fbe8d7a10f70e7e852d76be2b29f23ead418a5899d366ea32b1eacb5a5"}, - {file = "botocore-1.35.88.tar.gz", hash = "sha256:58dcd9a464c354b8c6c25261d8de830d175d9739eae568bf0c52e57116fb03c6"}, + {file = "botocore-1.35.93-py3-none-any.whl", hash = "sha256:47f7161000af6036f806449e3de12acdd3ec11aac7f5578e43e96241413a0f8f"}, + {file = "botocore-1.35.93.tar.gz", hash = "sha256:b8d245a01e7d64c41edcf75a42be158df57b9518a83a3dbf5c7e4b8c2bc540cc"}, ] [package.dependencies] From c9249c330e47e05a52284124ff8172bcc232c737 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:55:59 -0500 Subject: [PATCH 17/32] Build: Bump mypy-boto3-glue from 1.35.87 to 1.35.93 (#1496) Bumps [mypy-boto3-glue](https://github.com/youtype/mypy_boto3_builder) from 1.35.87 to 1.35.93. - [Release notes](https://github.com/youtype/mypy_boto3_builder/releases) - [Commits](https://github.com/youtype/mypy_boto3_builder/commits) --- updated-dependencies: - dependency-name: mypy-boto3-glue dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index c96050b0df..684d304bba 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2886,17 +2886,17 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} [[package]] name = "mypy-boto3-glue" -version = "1.35.87" -description = "Type annotations for boto3 Glue 1.35.87 service generated with mypy-boto3-builder 8.7.0" +version = "1.35.93" +description = "Type annotations for boto3 Glue 1.35.93 service generated with mypy-boto3-builder 8.8.0" optional = true python-versions = ">=3.8" files = [ - {file = "mypy_boto3_glue-1.35.87-py3-none-any.whl", hash = "sha256:c4c62daf80e99ad539491b63814b7cf94a5e4f1fca732540a9aaae458af52691"}, - {file = "mypy_boto3_glue-1.35.87.tar.gz", hash = "sha256:d1d5f1bb5c5297045a1a650a6672c46a319e3cf373085d2303c2179dc5b46d7d"}, + {file = "mypy_boto3_glue-1.35.93-py3-none-any.whl", hash = "sha256:cf46553f68048124bad65345b593ec5ba3806bd9bd15a1d7516d0cb3d79a0652"}, + {file = "mypy_boto3_glue-1.35.93.tar.gz", hash = "sha256:27759a83ffa5414b2589da83625816a3c7cb97600fec68578bd3012a9ae20ee8"}, ] [package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} +typing-extensions = {version = "*", markers = "python_version < \"3.12\""} [[package]] name = "natsort" From a95f9ee6e231104319c01493cb3ada59d9e782d0 Mon Sep 17 00:00:00 2001 From: jeppe-dos Date: Thu, 9 Jan 2025 19:14:22 +0100 Subject: [PATCH 18/32] Change dot notation in add column documentation to tuple (#1433) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Change dot notation in add column documentation to tuple * Update move and rename column struct in api.md * Correct rename_column, move_before and delete_column in api.md * Change exchange to processed by on rename_column in api.md * Update mkdocs/docs/api.md Co-authored-by: Kevin Liu * Fix rename column in api.md * Update mkdocs/docs/api.md * Update mkdocs/docs/api.md --------- Co-authored-by: Jeppe Finne Sørensen Co-authored-by: Kevin Liu --- mkdocs/docs/api.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 
deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 9c48718877..8b106c1034 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1072,8 +1072,12 @@ Using `add_column` you can add a column, without having to worry about the field with table.update_schema() as update: update.add_column("retries", IntegerType(), "Number of retries to place the bid") # In a struct - update.add_column("details.confirmed_by", StringType(), "Name of the exchange") + update.add_column("details", StructType()) + +with table.update_schema() as update: + update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange") ``` +A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple. ### Rename column @@ -1082,20 +1086,21 @@ Renaming a field in an Iceberg table is simple: ```python with table.update_schema() as update: update.rename_column("retries", "num_retries") - # This will rename `confirmed_by` to `exchange` - update.rename_column("properties.confirmed_by", "exchange") + # This will rename `confirmed_by` to `processed_by` in the `details` struct + update.rename_column(("details", "confirmed_by"), "processed_by") ``` ### Move column -Move a field inside of struct: +Move order of fields: ```python with table.update_schema() as update: update.move_first("symbol") + # This will move `bid` after `ask` update.move_after("bid", "ask") - # This will move `confirmed_by` before `exchange` - update.move_before("details.created_by", "details.exchange") + # This will move `confirmed_by` before `exchange` in the `details` struct + update.move_before(("details", "confirmed_by"), ("details", "exchange")) ``` ### Update column @@ -1127,6 +1132,8 @@ Delete a field, careful this is a incompatible change (readers/writers might exp ```python with table.update_schema(allow_incompatible_changes=True) as update: update.delete_column("some_field") + # In a struct + update.delete_column(("details", "confirmed_by")) ``` ## Partition evolution From 19ad24ef7d32485701c4baf85565a6f3614839ff Mon Sep 17 00:00:00 2001 From: smaheshwar-pltr Date: Fri, 10 Jan 2025 20:43:28 +0000 Subject: [PATCH 19/32] Nit fixes to URL-encoding of partition field names (#1499) * Revert "Add `make_name_compatible` suggestion so test passes" This reverts commit 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2. * Nit fixes to URL-encoding of partition field names * Fix tests * Collapse * Make lint --------- Co-authored-by: Sreesh Maheshwar --- mkdocs/docs/api.md | 1 + pyiceberg/partitioning.py | 7 +--- tests/integration/test_partitioning_key.py | 47 ++-------------------- 3 files changed, 7 insertions(+), 48 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 8b106c1034..f1ef69b9cb 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1077,6 +1077,7 @@ with table.update_schema() as update: with table.update_schema() as update: update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange") ``` + A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple. 
### Rename column diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index c9b6316f59..1813772217 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -234,11 +234,8 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: partition_field = self.fields[pos] value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) - value_str = quote_plus(value_str, safe="") - value_strs.append(value_str) - - field_str = quote_plus(partition_field.name, safe="") - field_strs.append(field_str) + value_strs.append(quote_plus(value_str, safe="")) + field_strs.append(quote_plus(partition_field.name, safe="")) path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)]) return path diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 1ac808c7d0..3955259d33 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -18,7 +18,7 @@ import uuid from datetime import date, datetime, timedelta, timezone from decimal import Decimal -from typing import Any, Callable, List, Optional +from typing import Any, List import pytest from pyspark.sql import SparkSession @@ -26,7 +26,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec -from pyiceberg.schema import Schema +from pyiceberg.schema import Schema, make_compatible_name from pyiceberg.transforms import ( BucketTransform, DayTransform, @@ -78,7 +78,7 @@ @pytest.mark.parametrize( - "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name", + "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", [ # # Identity Transform ( @@ -99,7 +99,6 @@ VALUES (false, 'Boolean field set to false'); """, - None, ), ( [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")], @@ -119,7 +118,6 @@ VALUES ('sample_string', 'Another string value') """, - None, ), ( [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")], @@ -139,7 +137,6 @@ VALUES (42, 'Associated string value for int 42') """, - None, ), ( [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")], @@ -159,7 +156,6 @@ VALUES (1234567890123456789, 'Associated string value for long 1234567890123456789') """, - None, ), ( [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")], @@ -183,7 +179,6 @@ # VALUES # (3.14, 'Associated string value for float 3.14') # """ - None, ), ( [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")], @@ -207,7 +202,6 @@ # VALUES # (6.282, 'Associated string value for double 6.282') # """ - None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -227,7 +221,6 @@ VALUES (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, - None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -247,7 +240,6 @@ VALUES (CAST('2023-01-01 12:00:01' AS 
TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') """, - None, ), ( [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")], @@ -272,7 +264,6 @@ # VALUES # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00') # """ - None, ), ( [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")], @@ -297,7 +288,6 @@ # VALUES # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # """ - None, ), ( [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")], @@ -317,7 +307,6 @@ VALUES (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01') """, - None, ), ( [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")], @@ -337,7 +326,6 @@ VALUES ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479') """, - None, ), ( [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")], @@ -357,7 +345,6 @@ VALUES (CAST('example' AS BINARY), 'Associated string value for binary `example`') """, - None, ), ( [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")], @@ -377,7 +364,6 @@ VALUES (123.45, 'Associated string value for decimal 123.45') """, - None, ), # # Year Month Day Hour Transform # Month Transform @@ -399,7 +385,6 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999'); """, - None, ), ( [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")], @@ -419,7 +404,6 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, - None, ), ( [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")], @@ -439,7 +423,6 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, - None, ), # Year Transform ( @@ -460,7 +443,6 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999'); """, - None, ), ( [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")], @@ -480,7 +462,6 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, - None, ), ( [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")], @@ -500,7 +481,6 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, - None, ), # # Day Transform ( @@ -521,7 +501,6 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, - None, ), ( [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")], @@ -541,7 +520,6 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, - None, ), ( [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")], @@ -561,7 +539,6 @@ VALUES (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01'); """, - None, ), # Hour Transform ( @@ -582,7 +559,6 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01'); """, - None, ), ( [PartitionField(source_id=9, field_id=1001, 
transform=HourTransform(), name="timestamptz_field_hour")], @@ -602,7 +578,6 @@ VALUES (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00'); """, - None, ), # Truncate Transform ( @@ -623,7 +598,6 @@ VALUES (12345, 'Sample data for int'); """, - None, ), ( [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")], @@ -643,7 +617,6 @@ VALUES (4294967297, 'Sample data for long'); """, - None, ), ( [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")], @@ -663,7 +636,6 @@ VALUES ('abcdefg', 'Another sample for string'); """, - None, ), ( [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")], @@ -683,7 +655,6 @@ VALUES (678.90, 'Associated string value for decimal 678.90') """, - None, ), ( [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")], @@ -703,7 +674,6 @@ VALUES (binary('HELLOICEBERG'), 'Sample data for binary'); """, - None, ), # Bucket Transform ( @@ -724,7 +694,6 @@ VALUES (10, 'Integer with value 10'); """, - None, ), # Test multiple field combinations could generate the Partition record and hive partition path correctly ( @@ -753,7 +722,6 @@ VALUES (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data'); """, - None, ), # Test that special characters are URL-encoded ( @@ -773,7 +741,6 @@ VALUES ('special string') """, - lambda name: name.replace("#", "_x23").replace("+", "_x2B"), ), ], ) @@ -787,7 +754,6 @@ def test_partition_key( expected_hive_partition_path_slice: str, spark_create_table_sql_for_justification: str, spark_data_insert_sql_for_justification: str, - make_compatible_name: Optional[Callable[[str], str]], ) -> None: partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)] spec = PartitionSpec(*partition_fields) @@ -823,11 +789,6 @@ def test_partition_key( snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path ) # Special characters in partition value are sanitized when written to the data file's partition field - # Use `make_compatible_name` to match the sanitize behavior - sanitized_record = ( - Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()}) - if make_compatible_name - else expected_partition_record - ) + sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()}) assert spark_partition_for_justification == sanitized_record assert expected_hive_partition_path_slice in spark_path_for_justification From ae272b5b37a3132932548d06fb9e8acd23f2bc57 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 10 Jan 2025 16:01:36 -0500 Subject: [PATCH 20/32] bump version to 0.9.0 (#1489) * bump to 0.8.1 * bump to 0.9.0 --- pyiceberg/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/__init__.py b/pyiceberg/__init__.py index 42c6e12f1b..e97de9276f 100644 --- a/pyiceberg/__init__.py +++ b/pyiceberg/__init__.py @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. -__version__ = "0.8.0" +__version__ = "0.9.0" diff --git a/pyproject.toml b/pyproject.toml index 58dac055ca..56be937305 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
[tool.poetry] name = "pyiceberg" -version = "0.8.0" +version = "0.9.0" readme = "README.md" homepage = "https://py.iceberg.apache.org/" repository = "https://github.com/apache/iceberg-python" From d9c5d6b4adf8c300ca47e3ac32cbe41c41f0bbd3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:06:37 -0500 Subject: [PATCH 21/32] Build: Bump pydantic from 2.10.4 to 2.10.5 (#1504) Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.10.4 to 2.10.5. - [Release notes](https://github.com/pydantic/pydantic/releases) - [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md) - [Commits](https://github.com/pydantic/pydantic/compare/v2.10.4...v2.10.5) --- updated-dependencies: - dependency-name: pydantic dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 684d304bba..156595db29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3573,13 +3573,13 @@ files = [ [[package]] name = "pydantic" -version = "2.10.4" +version = "2.10.5" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, - {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, + {file = "pydantic-2.10.5-py3-none-any.whl", hash = "sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53"}, + {file = "pydantic-2.10.5.tar.gz", hash = "sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff"}, ] [package.dependencies] From 52665512466c50c5fa62d026f8d7436b63bafcb9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:06:51 -0500 Subject: [PATCH 22/32] Build: Bump getdaft from 0.4.1 to 0.4.2 (#1503) Bumps [getdaft](https://github.com/Eventual-Inc/Daft) from 0.4.1 to 0.4.2. - [Release notes](https://github.com/Eventual-Inc/Daft/releases) - [Commits](https://github.com/Eventual-Inc/Daft/compare/v0.4.1...v0.4.2) --- updated-dependencies: - dependency-name: getdaft dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 156595db29..2c1ace347e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1496,17 +1496,17 @@ gcsfuse = ["fusepy"] [[package]] name = "getdaft" -version = "0.4.1" +version = "0.4.2" description = "Distributed Dataframes for Multimodal Data" optional = true python-versions = ">=3.9" files = [ - {file = "getdaft-0.4.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:04b91c019be87415138edfa61c379174a49760c4474c60eb37b1c24ae010a7d5"}, - {file = "getdaft-0.4.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:6254f33b5292b3198b6a0e4fdd0d2f568ff624930203d9af75bbc3b7e40e8c0b"}, - {file = "getdaft-0.4.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642f786175f543cb0d2dc585577c554b135f5ac2e7b34bfbe359dd86adbdbae"}, - {file = "getdaft-0.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1e1b0c283e0efc5102dea04db9a98bad6bcf36829a6c3d6cd511e8805514c0"}, - {file = "getdaft-0.4.1-cp39-abi3-win_amd64.whl", hash = "sha256:46985b2ec980134b97d3b8e95becd2b654cb74e2952d7b24b6f3b55d28d16de2"}, - {file = "getdaft-0.4.1.tar.gz", hash = "sha256:d3ad8b11b06bbf25b62a091444917593933ff53c39fb4a8abca8cbc6dde3b917"}, + {file = "getdaft-0.4.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3760e69e66e571dbb42ad354954bd52d3ce8eafdfc93c9bdaf2c1ed42017808e"}, + {file = "getdaft-0.4.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:2b1c072f69663b87e4f3aa926cf7441d1d150fe46a6d2b32c8b01f72a237680b"}, + {file = "getdaft-0.4.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0e6450fd90743bd981575dc3a1b6694fe1e4a9fe2fc31ea5ad1ca92e1dabef2"}, + {file = "getdaft-0.4.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852c71f81e1ff4fffd60ee7542ff325d1e93ec857adff8c26494a0188dc79ae"}, + {file = "getdaft-0.4.2-cp39-abi3-win_amd64.whl", hash = "sha256:687031e101dd4df151f387cc8a2a60bfc6bda640d4deb2d3a74a4f742eb57edf"}, + {file = "getdaft-0.4.2.tar.gz", hash = "sha256:9d253a5dce0ee798be9737ef1da60f313235fd459b4ff3b48e6aafe30538ff21"}, ] [package.dependencies] From 691740df974cc584b890110784ff8b6ac733cfdc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:07:00 -0500 Subject: [PATCH 23/32] Build: Bump sqlalchemy from 2.0.36 to 2.0.37 (#1502) Bumps [sqlalchemy](https://github.com/sqlalchemy/sqlalchemy) from 2.0.36 to 2.0.37. - [Release notes](https://github.com/sqlalchemy/sqlalchemy/releases) - [Changelog](https://github.com/sqlalchemy/sqlalchemy/blob/main/CHANGES.rst) - [Commits](https://github.com/sqlalchemy/sqlalchemy/commits) --- updated-dependencies: - dependency-name: sqlalchemy dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2c1ace347e..687ff5a3a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4660,72 +4660,72 @@ test = ["pytest"] [[package]] name = "sqlalchemy" -version = "2.0.36" +version = "2.0.37" description = "Database Abstraction Library" optional = true python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.36-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59b8f3adb3971929a3e660337f5dacc5942c2cdb760afcabb2614ffbda9f9f72"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37350015056a553e442ff672c2d20e6f4b6d0b2495691fa239d8aa18bb3bc908"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8318f4776c85abc3f40ab185e388bee7a6ea99e7fa3a30686580b209eaa35c08"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c245b1fbade9c35e5bd3b64270ab49ce990369018289ecfde3f9c318411aaa07"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:69f93723edbca7342624d09f6704e7126b152eaed3cdbb634cb657a54332a3c5"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f9511d8dd4a6e9271d07d150fb2f81874a3c8c95e11ff9af3a2dfc35fe42ee44"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-win32.whl", hash = "sha256:c3f3631693003d8e585d4200730616b78fafd5a01ef8b698f6967da5c605b3fa"}, - {file = "SQLAlchemy-2.0.36-cp310-cp310-win_amd64.whl", hash = "sha256:a86bfab2ef46d63300c0f06936bd6e6c0105faa11d509083ba8f2f9d237fb5b5"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fd3a55deef00f689ce931d4d1b23fa9f04c880a48ee97af488fd215cf24e2a6c"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4f5e9cd989b45b73bd359f693b935364f7e1f79486e29015813c338450aa5a71"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0ddd9db6e59c44875211bc4c7953a9f6638b937b0a88ae6d09eb46cced54eff"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2519f3a5d0517fc159afab1015e54bb81b4406c278749779be57a569d8d1bb0d"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59b1ee96617135f6e1d6f275bbe988f419c5178016f3d41d3c0abb0c819f75bb"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:39769a115f730d683b0eb7b694db9789267bcd027326cccc3125e862eb03bfd8"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-win32.whl", hash = "sha256:66bffbad8d6271bb1cc2f9a4ea4f86f80fe5e2e3e501a5ae2a3dc6a76e604e6f"}, - {file = "SQLAlchemy-2.0.36-cp311-cp311-win_amd64.whl", hash = "sha256:23623166bfefe1487d81b698c423f8678e80df8b54614c2bf4b4cfcd7c711959"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7b64e6ec3f02c35647be6b4851008b26cff592a95ecb13b6788a54ef80bbdd4"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:46331b00096a6db1fdc052d55b101dbbfc99155a548e20a0e4a8e5e4d1362855"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdf3386a801ea5aba17c6410dd1dc8d39cf454ca2565541b5ac42a84e1e28f53"}, - {file = 
"SQLAlchemy-2.0.36-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9dfa18ff2a67b09b372d5db8743c27966abf0e5344c555d86cc7199f7ad83a"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:90812a8933df713fdf748b355527e3af257a11e415b613dd794512461eb8a686"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1bc330d9d29c7f06f003ab10e1eaced295e87940405afe1b110f2eb93a233588"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-win32.whl", hash = "sha256:79d2e78abc26d871875b419e1fd3c0bca31a1cb0043277d0d850014599626c2e"}, - {file = "SQLAlchemy-2.0.36-cp312-cp312-win_amd64.whl", hash = "sha256:b544ad1935a8541d177cb402948b94e871067656b3a0b9e91dbec136b06a2ff5"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5cc79df7f4bc3d11e4b542596c03826063092611e481fcf1c9dfee3c94355ef"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3c01117dd36800f2ecaa238c65365b7b16497adc1522bf84906e5710ee9ba0e8"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9bc633f4ee4b4c46e7adcb3a9b5ec083bf1d9a97c1d3854b92749d935de40b9b"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e46ed38affdfc95d2c958de328d037d87801cfcbea6d421000859e9789e61c2"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b2985c0b06e989c043f1dc09d4fe89e1616aadd35392aea2844f0458a989eacf"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a121d62ebe7d26fec9155f83f8be5189ef1405f5973ea4874a26fab9f1e262c"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-win32.whl", hash = "sha256:0572f4bd6f94752167adfd7c1bed84f4b240ee6203a95e05d1e208d488d0d436"}, - {file = "SQLAlchemy-2.0.36-cp313-cp313-win_amd64.whl", hash = "sha256:8c78ac40bde930c60e0f78b3cd184c580f89456dd87fc08f9e3ee3ce8765ce88"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:be9812b766cad94a25bc63bec11f88c4ad3629a0cec1cd5d4ba48dc23860486b"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aae840ebbd6cdd41af1c14590e5741665e5272d2fee999306673a1bb1fdb4d"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4557e1f11c5f653ebfdd924f3f9d5ebfc718283b0b9beebaa5dd6b77ec290971"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:07b441f7d03b9a66299ce7ccf3ef2900abc81c0db434f42a5694a37bd73870f2"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:28120ef39c92c2dd60f2721af9328479516844c6b550b077ca450c7d7dc68575"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-win32.whl", hash = "sha256:b81ee3d84803fd42d0b154cb6892ae57ea6b7c55d8359a02379965706c7efe6c"}, - {file = "SQLAlchemy-2.0.36-cp37-cp37m-win_amd64.whl", hash = "sha256:f942a799516184c855e1a32fbc7b29d7e571b52612647866d4ec1c3242578fcb"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3d6718667da04294d7df1670d70eeddd414f313738d20a6f1d1f379e3139a545"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:72c28b84b174ce8af8504ca28ae9347d317f9dba3999e5981a3cd441f3712e24"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b11d0cfdd2b095e7b0686cf5fabeb9c67fae5b06d265d8180715b8cfa86522e3"}, - {file = 
"SQLAlchemy-2.0.36-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e32092c47011d113dc01ab3e1d3ce9f006a47223b18422c5c0d150af13a00687"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6a440293d802d3011028e14e4226da1434b373cbaf4a4bbb63f845761a708346"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c54a1e53a0c308a8e8a7dffb59097bff7facda27c70c286f005327f21b2bd6b1"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-win32.whl", hash = "sha256:1e0d612a17581b6616ff03c8e3d5eff7452f34655c901f75d62bd86449d9750e"}, - {file = "SQLAlchemy-2.0.36-cp38-cp38-win_amd64.whl", hash = "sha256:8958b10490125124463095bbdadda5aa22ec799f91958e410438ad6c97a7b793"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dc022184d3e5cacc9579e41805a681187650e170eb2fd70e28b86192a479dcaa"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b817d41d692bf286abc181f8af476c4fbef3fd05e798777492618378448ee689"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e46a888b54be23d03a89be510f24a7652fe6ff660787b96cd0e57a4ebcb46d"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4ae3005ed83f5967f961fd091f2f8c5329161f69ce8480aa8168b2d7fe37f06"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:03e08af7a5f9386a43919eda9de33ffda16b44eb11f3b313e6822243770e9763"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3dbb986bad3ed5ceaf090200eba750b5245150bd97d3e67343a3cfed06feecf7"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-win32.whl", hash = "sha256:9fe53b404f24789b5ea9003fc25b9a3988feddebd7e7b369c8fac27ad6f52f28"}, - {file = "SQLAlchemy-2.0.36-cp39-cp39-win_amd64.whl", hash = "sha256:af148a33ff0349f53512a049c6406923e4e02bf2f26c5fb285f143faf4f0e46a"}, - {file = "SQLAlchemy-2.0.36-py3-none-any.whl", hash = "sha256:fddbe92b4760c6f5d48162aef14824add991aeda8ddadb3c31d56eb15ca69f8e"}, - {file = "sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da36c3b0e891808a7542c5c89f224520b9a16c7f5e4d6a1156955605e54aef0e"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e7402ff96e2b073a98ef6d6142796426d705addd27b9d26c3b32dbaa06d7d069"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6f5d254a22394847245f411a2956976401e84da4288aa70cbcd5190744062c1"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41296bbcaa55ef5fdd32389a35c710133b097f7b2609d8218c0eabded43a1d84"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bedee60385c1c0411378cbd4dc486362f5ee88deceea50002772912d798bb00f"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6c67415258f9f3c69867ec02fea1bf6508153709ecbd731a982442a590f2b7e4"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-win32.whl", hash = "sha256:650dcb70739957a492ad8acff65d099a9586b9b8920e3507ca61ec3ce650bb72"}, + {file = "SQLAlchemy-2.0.37-cp310-cp310-win_amd64.whl", hash = "sha256:93d1543cd8359040c02b6614421c8e10cd7a788c40047dbc507ed46c29ae5636"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:78361be6dc9073ed17ab380985d1e45e48a642313ab68ab6afa2457354ff692c"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b661b49d0cb0ab311a189b31e25576b7ac3e20783beb1e1817d72d9d02508bf5"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d57bafbab289e147d064ffbd5cca2d7b1394b63417c0636cea1f2e93d16eb9e8"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fa2c0913f02341d25fb858e4fb2031e6b0813494cca1ba07d417674128ce11b"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9df21b8d9e5c136ea6cde1c50d2b1c29a2b5ff2b1d610165c23ff250e0704087"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db18ff6b8c0f1917f8b20f8eca35c28bbccb9f83afa94743e03d40203ed83de9"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-win32.whl", hash = "sha256:46954173612617a99a64aee103bcd3f078901b9a8dcfc6ae80cbf34ba23df989"}, + {file = "SQLAlchemy-2.0.37-cp311-cp311-win_amd64.whl", hash = "sha256:7b7e772dc4bc507fdec4ee20182f15bd60d2a84f1e087a8accf5b5b7a0dcf2ba"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2952748ecd67ed3b56773c185e85fc084f6bdcdec10e5032a7c25a6bc7d682ef"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3151822aa1db0eb5afd65ccfafebe0ef5cda3a7701a279c8d0bf17781a793bb4"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaa8039b6d20137a4e02603aba37d12cd2dde7887500b8855356682fc33933f4"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cdba1f73b64530c47b27118b7053b8447e6d6f3c8104e3ac59f3d40c33aa9fd"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1b2690456528a87234a75d1a1644cdb330a6926f455403c8e4f6cad6921f9098"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf5ae8a9dcf657fd72144a7fd01f243236ea39e7344e579a121c4205aedf07bb"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-win32.whl", hash = "sha256:ea308cec940905ba008291d93619d92edaf83232ec85fbd514dcb329f3192761"}, + {file = "SQLAlchemy-2.0.37-cp312-cp312-win_amd64.whl", hash = "sha256:635d8a21577341dfe4f7fa59ec394b346da12420b86624a69e466d446de16aff"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8c4096727193762e72ce9437e2a86a110cf081241919ce3fab8e89c02f6b6658"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4fb5ac86d8fe8151966814f6720996430462e633d225497566b3996966b9bdb"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e56a139bfe136a22c438478a86f8204c1eb5eed36f4e15c4224e4b9db01cb3e4"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f95fc8e3f34b5f6b3effb49d10ac97c569ec8e32f985612d9b25dd12d0d2e94"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c505edd429abdfe3643fa3b2e83efb3445a34a9dc49d5f692dd087be966020e0"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:12b0f1ec623cccf058cf21cb544f0e74656618165b083d78145cafde156ea7b6"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-win32.whl", hash = "sha256:293f9ade06b2e68dd03cfb14d49202fac47b7bb94bffcff174568c951fbc7af2"}, + {file = "SQLAlchemy-2.0.37-cp313-cp313-win_amd64.whl", hash = 
"sha256:d70f53a0646cc418ca4853da57cf3ddddbccb8c98406791f24426f2dd77fd0e2"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:44f569d0b1eb82301b92b72085583277316e7367e038d97c3a1a899d9a05e342"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2eae3423e538c10d93ae3e87788c6a84658c3ed6db62e6a61bb9495b0ad16bb"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfff7be361048244c3aa0f60b5e63221c5e0f0e509f4e47b8910e22b57d10ae7"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:5bc3339db84c5fb9130ac0e2f20347ee77b5dd2596ba327ce0d399752f4fce39"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:84b9f23b0fa98a6a4b99d73989350a94e4a4ec476b9a7dfe9b79ba5939f5e80b"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-win32.whl", hash = "sha256:51bc9cfef83e0ac84f86bf2b10eaccb27c5a3e66a1212bef676f5bee6ef33ebb"}, + {file = "SQLAlchemy-2.0.37-cp37-cp37m-win_amd64.whl", hash = "sha256:8e47f1af09444f87c67b4f1bb6231e12ba6d4d9f03050d7fc88df6d075231a49"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6b788f14c5bb91db7f468dcf76f8b64423660a05e57fe277d3f4fad7b9dcb7ce"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521ef85c04c33009166777c77e76c8a676e2d8528dc83a57836b63ca9c69dcd1"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75311559f5c9881a9808eadbeb20ed8d8ba3f7225bef3afed2000c2a9f4d49b9"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cce918ada64c956b62ca2c2af59b125767097ec1dca89650a6221e887521bfd7"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9d087663b7e1feabea8c578d6887d59bb00388158e8bff3a76be11aa3f748ca2"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cf95a60b36997dad99692314c4713f141b61c5b0b4cc5c3426faad570b31ca01"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-win32.whl", hash = "sha256:d75ead7dd4d255068ea0f21492ee67937bd7c90964c8f3c2bea83c7b7f81b95f"}, + {file = "SQLAlchemy-2.0.37-cp38-cp38-win_amd64.whl", hash = "sha256:74bbd1d0a9bacf34266a7907d43260c8d65d31d691bb2356f41b17c2dca5b1d0"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:648ec5acf95ad59255452ef759054f2176849662af4521db6cb245263ae4aa33"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:35bd2df269de082065d4b23ae08502a47255832cc3f17619a5cea92ce478b02b"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f581d365af9373a738c49e0c51e8b18e08d8a6b1b15cc556773bcd8a192fa8b"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82df02816c14f8dc9f4d74aea4cb84a92f4b0620235daa76dde002409a3fbb5a"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94b564e38b344d3e67d2e224f0aec6ba09a77e4582ced41e7bfd0f757d926ec9"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:955a2a765aa1bd81aafa69ffda179d4fe3e2a3ad462a736ae5b6f387f78bfeb8"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-win32.whl", hash = "sha256:03f0528c53ca0b67094c4764523c1451ea15959bbf0a8a8a3096900014db0278"}, + {file = "SQLAlchemy-2.0.37-cp39-cp39-win_amd64.whl", hash = 
"sha256:4b12885dc85a2ab2b7d00995bac6d967bffa8594123b02ed21e8eb2205a7584b"}, + {file = "SQLAlchemy-2.0.37-py3-none-any.whl", hash = "sha256:a8998bf9f8658bd3839cbc44ddbe982955641863da0c1efe5b00c1ab4f5c16b1"}, + {file = "sqlalchemy-2.0.37.tar.gz", hash = "sha256:12b28d99a9c14eaf4055810df1001557176716de0167b91026e648e65229bffb"}, ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version < \"3.13\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} +greenlet = {version = "!=0.4.17", markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} typing-extensions = ">=4.6.0" [package.extras] From c68b9b1eb0530c5df2a8b114f6df54b63a8374d8 Mon Sep 17 00:00:00 2001 From: smaheshwar-pltr Date: Fri, 10 Jan 2025 22:33:48 +0000 Subject: [PATCH 24/32] Support Location Providers (#1452) * Skeletal implementation * First attempt at hashing locations * Relocate to table submodule; code and comment improvements * Add unit tests * Remove entropy check * Nit: Prefer `self.table_properties` * Remove special character testing * Add integration tests for writes * Move all `LocationProviders`-related code into locations.py * Nit: tiny for loop refactor * Fix typo * Object storage as default location provider * Update tests/integration/test_writes/test_partitioned_writes.py Co-authored-by: Kevin Liu * Test entropy in test_object_storage_injects_entropy * Refactor integration tests to use properties and omit when default once * Use a different table property for custom location provision * write.location-provider.py-impl -> write.py-location-provider.impl * Make lint * Move location provider loading into `write_file` for back-compat * Make object storage no longer the default * Add test case for partitioned paths disabled but with no partition special case * Moved constants within ObjectStoreLocationProvider --------- Co-authored-by: Sreesh Maheshwar Co-authored-by: Kevin Liu --- pyiceberg/io/pyarrow.py | 7 +- pyiceberg/table/__init__.py | 15 +- pyiceberg/table/locations.py | 145 ++++++++++++++++++ .../test_writes/test_partitioned_writes.py | 39 +++++ tests/integration/test_writes/test_writes.py | 27 ++++ tests/table/test_locations.py | 130 ++++++++++++++++ 6 files changed, 355 insertions(+), 8 deletions(-) create mode 100644 pyiceberg/table/locations.py create mode 100644 tests/table/test_locations.py diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index ad7e4f4f85..1ce0842844 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -136,6 +136,7 @@ visit, visit_with_partner, ) +from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import TableMetadata from pyiceberg.table.name_mapping import NameMapping, apply_name_mapping from pyiceberg.transforms import TruncateTransform @@ -2305,6 +2306,7 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT, default=TableProperties.PARQUET_ROW_GROUP_LIMIT_DEFAULT, ) + location_provider = load_location_provider(table_location=table_metadata.location, table_properties=table_metadata.properties) def 
write_parquet(task: WriteTask) -> DataFile: table_schema = table_metadata.schema() @@ -2327,7 +2329,10 @@ def write_parquet(task: WriteTask) -> DataFile: for batch in task.record_batches ] arrow_table = pa.Table.from_batches(batches) - file_path = f"{table_metadata.location}/data/{task.generate_data_file_path('parquet')}" + file_path = location_provider.new_data_location( + data_file_name=task.generate_data_file_filename("parquet"), + partition_key=task.partition_key, + ) fo = io.new_output(file_path) with fo.create(overwrite=True) as fos: with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 7bc3fe838b..0c8c848c43 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -187,6 +187,14 @@ class TableProperties: WRITE_PARTITION_SUMMARY_LIMIT = "write.summary.partition-limit" WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT = 0 + WRITE_PY_LOCATION_PROVIDER_IMPL = "write.py-location-provider.impl" + + OBJECT_STORE_ENABLED = "write.object-storage.enabled" + OBJECT_STORE_ENABLED_DEFAULT = False + + WRITE_OBJECT_STORE_PARTITIONED_PATHS = "write.object-storage.partitioned-paths" + WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT = True + DELETE_MODE = "write.delete.mode" DELETE_MODE_COPY_ON_WRITE = "copy-on-write" DELETE_MODE_MERGE_ON_READ = "merge-on-read" @@ -1613,13 +1621,6 @@ def generate_data_file_filename(self, extension: str) -> str: # https://github.com/apache/iceberg/blob/a582968975dd30ff4917fbbe999f1be903efac02/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java#L92-L101 return f"00000-{self.task_id}-{self.write_uuid}.{extension}" - def generate_data_file_path(self, extension: str) -> str: - if self.partition_key: - file_path = f"{self.partition_key.to_path()}/{self.generate_data_file_filename(extension)}" - return file_path - else: - return self.generate_data_file_filename(extension) - @dataclass(frozen=True) class AddFileTask: diff --git a/pyiceberg/table/locations.py b/pyiceberg/table/locations.py new file mode 100644 index 0000000000..046ee32527 --- /dev/null +++ b/pyiceberg/table/locations.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import importlib +import logging +from abc import ABC, abstractmethod +from typing import Optional + +import mmh3 + +from pyiceberg.partitioning import PartitionKey +from pyiceberg.table import TableProperties +from pyiceberg.typedef import Properties +from pyiceberg.utils.properties import property_as_bool + +logger = logging.getLogger(__name__) + + +class LocationProvider(ABC): + """A base class for location providers, that provide data file locations for write tasks.""" + + table_location: str + table_properties: Properties + + def __init__(self, table_location: str, table_properties: Properties): + self.table_location = table_location + self.table_properties = table_properties + + @abstractmethod + def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str: + """Return a fully-qualified data file location for the given filename. + + Args: + data_file_name (str): The name of the data file. + partition_key (Optional[PartitionKey]): The data file's partition key. If None, the data is not partitioned. + + Returns: + str: A fully-qualified location URI for the data file. + """ + + +class SimpleLocationProvider(LocationProvider): + def __init__(self, table_location: str, table_properties: Properties): + super().__init__(table_location, table_properties) + + def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str: + prefix = f"{self.table_location}/data" + return f"{prefix}/{partition_key.to_path()}/{data_file_name}" if partition_key else f"{prefix}/{data_file_name}" + + +class ObjectStoreLocationProvider(LocationProvider): + HASH_BINARY_STRING_BITS = 20 + ENTROPY_DIR_LENGTH = 4 + ENTROPY_DIR_DEPTH = 3 + + _include_partition_paths: bool + + def __init__(self, table_location: str, table_properties: Properties): + super().__init__(table_location, table_properties) + self._include_partition_paths = property_as_bool( + self.table_properties, + TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS, + TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT, + ) + + def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str: + if self._include_partition_paths and partition_key: + return self.new_data_location(f"{partition_key.to_path()}/{data_file_name}") + + prefix = f"{self.table_location}/data" + hashed_path = self._compute_hash(data_file_name) + + return ( + f"{prefix}/{hashed_path}/{data_file_name}" + if self._include_partition_paths + else f"{prefix}/{hashed_path}-{data_file_name}" + ) + + @staticmethod + def _compute_hash(data_file_name: str) -> str: + # Bitwise AND to combat sign-extension; bitwise OR to preserve leading zeroes that `bin` would otherwise strip. 
+ top_mask = 1 << ObjectStoreLocationProvider.HASH_BINARY_STRING_BITS + hash_code = mmh3.hash(data_file_name) & (top_mask - 1) | top_mask + return ObjectStoreLocationProvider._dirs_from_hash(bin(hash_code)[-ObjectStoreLocationProvider.HASH_BINARY_STRING_BITS :]) + + @staticmethod + def _dirs_from_hash(file_hash: str) -> str: + """Divides hash into directories for optimized orphan removal operation using ENTROPY_DIR_DEPTH and ENTROPY_DIR_LENGTH.""" + total_entropy_length = ObjectStoreLocationProvider.ENTROPY_DIR_DEPTH * ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH + + hash_with_dirs = [] + for i in range(0, total_entropy_length, ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH): + hash_with_dirs.append(file_hash[i : i + ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH]) + + if len(file_hash) > total_entropy_length: + hash_with_dirs.append(file_hash[total_entropy_length:]) + + return "/".join(hash_with_dirs) + + +def _import_location_provider( + location_provider_impl: str, table_location: str, table_properties: Properties +) -> Optional[LocationProvider]: + try: + path_parts = location_provider_impl.split(".") + if len(path_parts) < 2: + raise ValueError( + f"{TableProperties.WRITE_PY_LOCATION_PROVIDER_IMPL} should be full path (module.CustomLocationProvider), got: {location_provider_impl}" + ) + module_name, class_name = ".".join(path_parts[:-1]), path_parts[-1] + module = importlib.import_module(module_name) + class_ = getattr(module, class_name) + return class_(table_location, table_properties) + except ModuleNotFoundError: + logger.warning("Could not initialize LocationProvider: %s", location_provider_impl) + return None + + +def load_location_provider(table_location: str, table_properties: Properties) -> LocationProvider: + table_location = table_location.rstrip("/") + + if location_provider_impl := table_properties.get(TableProperties.WRITE_PY_LOCATION_PROVIDER_IMPL): + if location_provider := _import_location_provider(location_provider_impl, table_location, table_properties): + logger.info("Loaded LocationProvider: %s", location_provider_impl) + return location_provider + else: + raise ValueError(f"Could not initialize LocationProvider: {location_provider_impl}") + + if property_as_bool(table_properties, TableProperties.OBJECT_STORE_ENABLED, TableProperties.OBJECT_STORE_ENABLED_DEFAULT): + return ObjectStoreLocationProvider(table_location, table_properties) + else: + return SimpleLocationProvider(table_location, table_properties) diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 8a3a5c9acc..50a1bc8c38 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -28,6 +28,7 @@ from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema +from pyiceberg.table import TableProperties from pyiceberg.transforms import ( BucketTransform, DayTransform, @@ -280,6 +281,44 @@ def test_query_filter_v1_v2_append_null( assert df.where(f"{col} is null").count() == 2, f"Expected 2 null rows for {col}" +@pytest.mark.integration +@pytest.mark.parametrize( + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_object_storage_location_provider_excludes_partition_path( + session_catalog: Catalog, spark: SparkSession, 
arrow_table_with_null: pa.Table, part_col: str, format_version: int +) -> None: + nested_field = TABLE_SCHEMA.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) + ) + + tbl = _create_table( + session_catalog=session_catalog, + identifier=f"default.arrow_table_v{format_version}_with_null_partitioned_on_col_{part_col}", + # write.object-storage.partitioned-paths defaults to True + properties={"format-version": str(format_version), TableProperties.OBJECT_STORE_ENABLED: True}, + data=[arrow_table_with_null], + partition_spec=partition_spec, + ) + + original_paths = tbl.inspect.data_files().to_pydict()["file_path"] + assert len(original_paths) == 3 + + # Update props to exclude partitioned paths and append data + with tbl.transaction() as tx: + tx.set_properties({TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS: False}) + tbl.append(arrow_table_with_null) + + added_paths = set(tbl.inspect.data_files().to_pydict()["file_path"]) - set(original_paths) + assert len(added_paths) == 3 + + # All paths before the props update should contain the partition, while all paths after should not + assert all(f"{part_col}=" in path for path in original_paths) + assert all(f"{part_col}=" not in path for path in added_paths) + + @pytest.mark.integration @pytest.mark.parametrize( "spec", diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index c23e836554..fff48b9373 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -285,6 +285,33 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w assert [row.deleted_data_files_count for row in rows] == [0, 1, 0, 0, 0] +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_object_storage_data_files( + spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int +) -> None: + tbl = _create_table( + session_catalog=session_catalog, + identifier="default.object_stored", + properties={"format-version": format_version, TableProperties.OBJECT_STORE_ENABLED: True}, + data=[arrow_table_with_null], + ) + tbl.append(arrow_table_with_null) + + paths = tbl.inspect.data_files().to_pydict()["file_path"] + assert len(paths) == 2 + + for location in paths: + assert location.startswith("s3://warehouse/default/object_stored/data/") + parts = location.split("/") + assert len(parts) == 11 + + # Entropy binary directories should have been injected + for dir_name in parts[6:10]: + assert dir_name + assert all(c in "01" for c in dir_name) + + @pytest.mark.integration def test_python_writes_with_spark_snapshot_reads( spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table diff --git a/tests/table/test_locations.py b/tests/table/test_locations.py new file mode 100644 index 0000000000..bda2442aca --- /dev/null +++ b/tests/table/test_locations.py @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import Optional + +import pytest + +from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec +from pyiceberg.schema import Schema +from pyiceberg.table.locations import LocationProvider, load_location_provider +from pyiceberg.transforms import IdentityTransform +from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.types import NestedField, StringType + +PARTITION_FIELD = PartitionField(source_id=1, field_id=1002, transform=IdentityTransform(), name="string_field") +PARTITION_KEY = PartitionKey( + raw_partition_field_values=[PartitionFieldValue(PARTITION_FIELD, "example_string")], + partition_spec=PartitionSpec(PARTITION_FIELD), + schema=Schema(NestedField(field_id=1, name="string_field", field_type=StringType(), required=False)), +) + + +class CustomLocationProvider(LocationProvider): + def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str: + return f"custom_location_provider/{data_file_name}" + + +def test_default_location_provider() -> None: + provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT) + + assert provider.new_data_location("my_file") == "table_location/data/my_file" + + +def test_custom_location_provider() -> None: + qualified_name = CustomLocationProvider.__module__ + "." + CustomLocationProvider.__name__ + provider = load_location_provider( + table_location="table_location", table_properties={"write.py-location-provider.impl": qualified_name} + ) + + assert provider.new_data_location("my_file") == "custom_location_provider/my_file" + + +def test_custom_location_provider_single_path() -> None: + with pytest.raises(ValueError, match=r"write\.py-location-provider\.impl should be full path"): + load_location_provider(table_location="table_location", table_properties={"write.py-location-provider.impl": "not_found"}) + + +def test_custom_location_provider_not_found() -> None: + with pytest.raises(ValueError, match=r"Could not initialize LocationProvider"): + load_location_provider( + table_location="table_location", table_properties={"write.py-location-provider.impl": "module.not_found"} + ) + + +def test_object_storage_injects_entropy() -> None: + provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "true"}) + + location = provider.new_data_location("test.parquet") + parts = location.split("/") + + assert len(parts) == 7 + assert parts[0] == "table_location" + assert parts[1] == "data" + assert parts[-1] == "test.parquet" + + # Entropy directories in the middle + for dir_name in parts[2:-1]: + assert dir_name + assert all(c in "01" for c in dir_name) + + +@pytest.mark.parametrize("object_storage", [True, False]) +def test_partition_value_in_path(object_storage: bool) -> None: + provider = load_location_provider( + table_location="table_location", + table_properties={ + "write.object-storage.enabled": str(object_storage), + }, + ) + + location = provider.new_data_location("test.parquet", PARTITION_KEY) + partition_segment = location.split("/")[-2] + + assert 
partition_segment == "string_field=example_string" + + +# NB: We test here with None partition key too because disabling partitioned paths still replaces final / with - even in +# paths of un-partitioned files. This matches the behaviour of the Java implementation. +@pytest.mark.parametrize("partition_key", [PARTITION_KEY, None]) +def test_object_storage_partitioned_paths_disabled(partition_key: Optional[PartitionKey]) -> None: + provider = load_location_provider( + table_location="table_location", + table_properties={ + "write.object-storage.enabled": "true", + "write.object-storage.partitioned-paths": "false", + }, + ) + + location = provider.new_data_location("test.parquet", partition_key) + + # No partition values included in the path and last part of entropy is separated with "-" + assert location == "table_location/data/0110/1010/0011/11101000-test.parquet" + + +@pytest.mark.parametrize( + ["data_file_name", "expected_hash"], + [ + ("a", "0101/0110/1001/10110010"), + ("b", "1110/0111/1110/00000011"), + ("c", "0010/1101/0110/01011111"), + ("d", "1001/0001/0100/01110011"), + ], +) +def test_hash_injection(data_file_name: str, expected_hash: str) -> None: + provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "true"}) + + assert provider.new_data_location(data_file_name) == f"table_location/data/{expected_hash}/{data_file_name}" From cad0ad7d9358315abe1315de2a64227d91acceaa Mon Sep 17 00:00:00 2001 From: Soumya Ghosh Date: Sat, 11 Jan 2025 06:41:46 +0530 Subject: [PATCH 25/32] Add `all_manifests` metadata table with tests (#1241) * Add `all_manifests` metadata table with tests * Move get_manifests_schema and get_all_manifests_schema to InspectTable class * Update tests for all_manifests table * Added linter changes in inspect.py --- pyiceberg/table/inspect.py | 75 +++++++++++++------- tests/integration/test_inspect_table.py | 92 +++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 24 deletions(-) diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index 71d38a2279..6dfa78a7ac 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -17,13 +17,14 @@ from __future__ import annotations from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple from pyiceberg.conversions import from_bytes from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary from pyiceberg.partitioning import PartitionSpec from pyiceberg.table.snapshots import Snapshot, ancestors_of from pyiceberg.types import PrimitiveType +from pyiceberg.utils.concurrent import ExecutorFactory from pyiceberg.utils.singleton import _convert_to_hashable_type if TYPE_CHECKING: @@ -346,7 +347,7 @@ def update_partitions_map( schema=table_schema, ) - def manifests(self) -> "pa.Table": + def _get_manifests_schema(self) -> "pa.Schema": import pyarrow as pa partition_summary_schema = pa.struct( @@ -374,6 +375,17 @@ def manifests(self) -> "pa.Table": pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), ] ) + return manifest_schema + + def _get_all_manifests_schema(self) -> "pa.Schema": + import pyarrow as pa + + all_manifests_schema = self._get_manifests_schema() + all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False)) + return all_manifests_schema + + def 
_generate_manifests_table(self, snapshot: Optional[Snapshot], is_all_manifests_table: bool = False) -> "pa.Table": + import pyarrow as pa def _partition_summaries_to_rows( spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] @@ -412,36 +424,38 @@ def _partition_summaries_to_rows( specs = self.tbl.metadata.specs() manifests = [] - if snapshot := self.tbl.metadata.current_snapshot(): + if snapshot: for manifest in snapshot.manifests(self.tbl.io): is_data_file = manifest.content == ManifestContent.DATA is_delete_file = manifest.content == ManifestContent.DELETES - manifests.append( - { - "content": manifest.content, - "path": manifest.manifest_path, - "length": manifest.manifest_length, - "partition_spec_id": manifest.partition_spec_id, - "added_snapshot_id": manifest.added_snapshot_id, - "added_data_files_count": manifest.added_files_count if is_data_file else 0, - "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, - "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, - "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, - "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, - "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, - "partition_summaries": _partition_summaries_to_rows( - specs[manifest.partition_spec_id], manifest.partitions - ) - if manifest.partitions - else [], - } - ) + manifest_row = { + "content": manifest.content, + "path": manifest.manifest_path, + "length": manifest.manifest_length, + "partition_spec_id": manifest.partition_spec_id, + "added_snapshot_id": manifest.added_snapshot_id, + "added_data_files_count": manifest.added_files_count if is_data_file else 0, + "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, + "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, + "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, + "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, + "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, + "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) + if manifest.partitions + else [], + } + if is_all_manifests_table: + manifest_row["reference_snapshot_id"] = snapshot.snapshot_id + manifests.append(manifest_row) return pa.Table.from_pylist( manifests, - schema=manifest_schema, + schema=self._get_all_manifests_schema() if is_all_manifests_table else self._get_manifests_schema(), ) + def manifests(self) -> "pa.Table": + return self._generate_manifests_table(self.tbl.current_snapshot()) + def metadata_log_entries(self) -> "pa.Table": import pyarrow as pa @@ -630,3 +644,16 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table": def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table": return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}) + + def all_manifests(self) -> "pa.Table": + import pyarrow as pa + + snapshots = self.tbl.snapshots() + if not snapshots: + return pa.Table.from_pylist([], schema=self._get_all_manifests_schema()) + + executor = ExecutorFactory.get_or_create() + manifests_by_snapshots: Iterator["pa.Table"] = executor.map( + lambda args: self._generate_manifests_table(*args), [(snapshot, True) for snapshot in snapshots] + ) + return 
pa.concat_tables(manifests_by_snapshots) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 68b10f3262..75fe92a69a 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -846,3 +846,95 @@ def inspect_files_asserts(df: pa.Table) -> None: inspect_files_asserts(files_df) inspect_files_asserts(data_files_df) inspect_files_asserts(delete_files_df) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + from pandas.testing import assert_frame_equal + + identifier = "default.table_metadata_all_manifests" + try: + session_catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + spark.sql( + f""" + CREATE TABLE {identifier} ( + id int, + data string + ) + PARTITIONED BY (data) + TBLPROPERTIES ('write.update.mode'='merge-on-read', + 'write.delete.mode'='merge-on-read') + """ + ) + tbl = session_catalog.load_table(identifier) + + # check all_manifests when there are no snapshots + lhs = tbl.inspect.all_manifests().to_pandas() + rhs = spark.table(f"{identifier}.all_manifests").toPandas() + assert_frame_equal(lhs, rhs, check_dtype=False) + + spark.sql(f"INSERT INTO {identifier} VALUES (1, 'a')") + + spark.sql(f"INSERT INTO {identifier} VALUES (2, 'b')") + + spark.sql(f"UPDATE {identifier} SET data = 'c' WHERE id = 1") + + spark.sql(f"DELETE FROM {identifier} WHERE id = 2") + + spark.sql(f"INSERT OVERWRITE {identifier} VALUES (1, 'a')") + + tbl.refresh() + df = tbl.inspect.all_manifests() + + assert df.column_names == [ + "content", + "path", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", + "partition_summaries", + "reference_snapshot_id", + ] + + int_cols = [ + "content", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", + "reference_snapshot_id", + ] + + for column in int_cols: + for value in df[column]: + assert isinstance(value.as_py(), int) + + for value in df["path"]: + assert isinstance(value.as_py(), str) + + for value in df["partition_summaries"]: + assert isinstance(value.as_py(), list) + for row in value: + assert isinstance(row["contains_null"].as_py(), bool) + assert isinstance(row["contains_nan"].as_py(), (bool, type(None))) + assert isinstance(row["lower_bound"].as_py(), (str, type(None))) + assert isinstance(row["upper_bound"].as_py(), (str, type(None))) + + lhs = spark.table(f"{identifier}.all_manifests").toPandas() + rhs = df.to_pandas() + assert_frame_equal(lhs, rhs, check_dtype=False) From aface466f3393c8999bb5e2d90d9ff628044010c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Jan 2025 14:13:05 -0500 Subject: [PATCH 26/32] Build: Bump deptry from 0.21.2 to 0.22.0 (#1508) Bumps [deptry](https://github.com/fpgmaas/deptry) from 0.21.2 to 0.22.0. 
- [Release notes](https://github.com/fpgmaas/deptry/releases) - [Changelog](https://github.com/fpgmaas/deptry/blob/main/CHANGELOG.md) - [Commits](https://github.com/fpgmaas/deptry/compare/0.21.2...0.22.0) --- updated-dependencies: - dependency-name: deptry dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 36 ++++++++++++++++++------------------ pyproject.toml | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/poetry.lock b/poetry.lock index 687ff5a3a8..58e36274bf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1063,27 +1063,27 @@ files = [ [[package]] name = "deptry" -version = "0.21.2" +version = "0.22.0" description = "A command line utility to check for unused, missing and transitive dependencies in a Python project." optional = false python-versions = ">=3.9" files = [ - {file = "deptry-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e3b9e0c5ee437240b65e61107b5777a12064f78f604bf9f181a96c9b56eb896d"}, - {file = "deptry-0.21.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d76bbf48bd62ecc44ca3d414769bd4b7956598d23d9ccb42fd359b831a31cab2"}, - {file = "deptry-0.21.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3080bb88c16ebd35f59cba7688416115b7aaf4630dc5a051dff2649cbf129a1b"}, - {file = "deptry-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adb12d6678fb5dbd320a0a2e37881059d0a45bec6329df4250c977d803fe7f96"}, - {file = "deptry-0.21.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:7479d3079be69c3bbf5913d8e21090749c1139ee91f81520ffce90b5322476b0"}, - {file = "deptry-0.21.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:019167b35301edd2bdd4719c8b8f44769be4507cb8a1cd46fff4393cdbe8d31b"}, - {file = "deptry-0.21.2-cp39-abi3-win_amd64.whl", hash = "sha256:d8add495f0dd19a38aa6d1e09b14b1441bca47c9d945bc7b322efb084313eea3"}, - {file = "deptry-0.21.2-cp39-abi3-win_arm64.whl", hash = "sha256:06d48e9fa460aad02f9e1b079d9f5a69d622d291b3a0525b722fc91c88032042"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3ef8aed33a2eac357f9565063bc1257bcefa03a37038299c08a4222e28f3cd34"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:917745db5f8295eb5048e43d9073a9a675ffdba865e9b294d2e7aa455730cb06"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:186ddbc69c1f70e684e83e202795e1054d0c2dfc03b8acc077f65dc3b6a7f4ce"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3686e86ad7063b5a6e5253454f9d9e4a7a6b1511a99bd4306fda5424480be48"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1012a88500f242489066f811f6ec0c93328d9340bbf0f87f0c7d2146054d197e"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:769bb658172586d1b03046bdc6b6c94f6a98ecfbac04ff7f77ec61768c75e1c2"}, - {file = "deptry-0.21.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fb2f43747b58abeec01dc277ef22859342f3bca2ac677818c94940a009b436c0"}, - {file = "deptry-0.21.2.tar.gz", hash = "sha256:4e870553c7a1fafcd99a83ba4137259525679eecabeff61bc669741efa201541"}, + {file = "deptry-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2b903c94162e30640bb7a3e6800c7afd03a6bb12b693a21290e06c713dba35af"}, + {file = 
"deptry-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8b523a33bed952679c97a9f55c690803f0fbeb32649946dcc1362c3f015897c7"}, + {file = "deptry-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c68fa570be1443888d252c6f551356777e56e82e492e68e6db3d65b31100c450"}, + {file = "deptry-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:016f8a5b6c32762beea47a4d9d2d7b04f1b6e534448e5444c7a742bd2fdb260d"}, + {file = "deptry-0.22.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:46c868a0493556b41096f9824a15a3ce38811e6b4a2699ebec16e06e9f85cd84"}, + {file = "deptry-0.22.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:aebba0d1ca119f6241ff0d5b72e72a9b912fa880e81f4ab346a32d9001d6ddb1"}, + {file = "deptry-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:2da497a9888f930b5c86c6524b29a4d284ed320edd4148ecc2e45e10f177f4fe"}, + {file = "deptry-0.22.0-cp39-abi3-win_arm64.whl", hash = "sha256:35acf2ac783ba2ec43ba593ba14e0080393c0ab24797ba55fbed30f0ba02259f"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9db9d0b8244e2b20bd75a21312c35ee628a602b00c0e2f267fb90f4600de6d2d"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:edd0060065325cd70e6ce47feaa724cdb7fc3f4de673e4ed0fa38e8c1adc4155"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b371a3c3194c2db9196ab1f80d5ce08138dea731eff8dd9fb2997da42941fa7"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e20a8ba89078d06440316dba719c2278fdb19923e76633b808fd1b5670020c4"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f4872f48225d1e7dbacb1be5e427945c8f76abf6b91453e038aae076b638ba01"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:9a12ebe86299e7bb054804464467f33c49e5a34f204b710fa10fbe1f31c56964"}, + {file = "deptry-0.22.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbe6211b972337acdeec6c11a82b666597c1edd6c6e2a93eb705bf49644bfb08"}, + {file = "deptry-0.22.0.tar.gz", hash = "sha256:32212cd40562f71b24da69babaed9a4233c567da390f681d86bb66f8ec4d2bfe"}, ] [package.dependencies] @@ -5357,4 +5357,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "59e5678cd718f658c5bd099c03051564ee60f991e5f222bf92da13d1dd025a42" +content-hash = "6879624132285053b73c134d72db38b6dace947c67788387a2042d6c78569970" diff --git a/pyproject.toml b/pyproject.toml index 56be937305..db84bd27f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,7 @@ typing-extensions = "4.12.2" pytest-mock = "3.14.0" pyspark = "3.5.3" cython = "3.0.11" -deptry = ">=0.14,<0.22" +deptry = ">=0.14,<0.23" docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520 [tool.poetry.group.docs.dependencies] From c409678ffb81e22f23fbed1561373a2b8e47cc86 Mon Sep 17 00:00:00 2001 From: smaheshwar-pltr Date: Mon, 13 Jan 2025 14:52:54 +0000 Subject: [PATCH 27/32] Use `ObjectStoreLocationProvider` by default (#1509) * Make object storage the default location provider * Nit: Remove comment beside property to prefer docs - Removed table proper * Nit: Add asserts for table properties defaults as well as comment in test --------- Co-authored-by: Sreesh Maheshwar --- pyiceberg/table/__init__.py | 2 +- tests/integration/test_writes/test_partitioned_writes.py | 6 ++++-- 
tests/table/test_locations.py | 7 +++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 0c8c848c43..f2df84d7ee 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -190,7 +190,7 @@ class TableProperties: WRITE_PY_LOCATION_PROVIDER_IMPL = "write.py-location-provider.impl" OBJECT_STORE_ENABLED = "write.object-storage.enabled" - OBJECT_STORE_ENABLED_DEFAULT = False + OBJECT_STORE_ENABLED_DEFAULT = True WRITE_OBJECT_STORE_PARTITIONED_PATHS = "write.object-storage.partitioned-paths" WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT = True diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 50a1bc8c38..9e7632852c 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -294,11 +294,13 @@ def test_object_storage_location_provider_excludes_partition_path( PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) ) + # write.object-storage.enabled and write.object-storage.partitioned-paths don't need to be specified as they're on by default + assert TableProperties.OBJECT_STORE_ENABLED_DEFAULT + assert TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT tbl = _create_table( session_catalog=session_catalog, identifier=f"default.arrow_table_v{format_version}_with_null_partitioned_on_col_{part_col}", - # write.object-storage.partitioned-paths defaults to True - properties={"format-version": str(format_version), TableProperties.OBJECT_STORE_ENABLED: True}, + properties={"format-version": str(format_version)}, data=[arrow_table_with_null], partition_spec=partition_spec, ) diff --git a/tests/table/test_locations.py b/tests/table/test_locations.py index bda2442aca..6753fe5a26 100644 --- a/tests/table/test_locations.py +++ b/tests/table/test_locations.py @@ -39,7 +39,7 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[Partiti def test_default_location_provider() -> None: - provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT) + provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "false"}) assert provider.new_data_location("my_file") == "table_location/data/my_file" @@ -66,7 +66,7 @@ def test_custom_location_provider_not_found() -> None: def test_object_storage_injects_entropy() -> None: - provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "true"}) + provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT) location = provider.new_data_location("test.parquet") parts = location.split("/") @@ -104,7 +104,6 @@ def test_object_storage_partitioned_paths_disabled(partition_key: Optional[Parti provider = load_location_provider( table_location="table_location", table_properties={ - "write.object-storage.enabled": "true", "write.object-storage.partitioned-paths": "false", }, ) @@ -125,6 +124,6 @@ def test_object_storage_partitioned_paths_disabled(partition_key: Optional[Parti ], ) def test_hash_injection(data_file_name: str, expected_hash: str) -> None: - provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "true"}) + provider = load_location_provider(table_location="table_location", 
table_properties=EMPTY_DICT) assert provider.new_data_location(data_file_name) == f"table_location/data/{expected_hash}/{data_file_name}" From a09bcde43c40e0a582fbfeb1e971aa52278c99c5 Mon Sep 17 00:00:00 2001 From: smaheshwar-pltr Date: Mon, 13 Jan 2025 17:38:47 +0000 Subject: [PATCH 28/32] Improve `LocationProvider` unit tests (#1511) * Improve `LocationProvider` unit tests * Renamed `test_object_storage_injects_entropy` to test_object_storage_no_partition --------- Co-authored-by: Sreesh Maheshwar --- tests/table/test_locations.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/table/test_locations.py b/tests/table/test_locations.py index 6753fe5a26..67911b6271 100644 --- a/tests/table/test_locations.py +++ b/tests/table/test_locations.py @@ -38,12 +38,18 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[Partiti return f"custom_location_provider/{data_file_name}" -def test_default_location_provider() -> None: +def test_simple_location_provider_no_partition() -> None: provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "false"}) assert provider.new_data_location("my_file") == "table_location/data/my_file" +def test_simple_location_provider_with_partition() -> None: + provider = load_location_provider(table_location="table_location", table_properties={"write.object-storage.enabled": "false"}) + + assert provider.new_data_location("my_file", PARTITION_KEY) == "table_location/data/string_field=example_string/my_file" + + def test_custom_location_provider() -> None: qualified_name = CustomLocationProvider.__module__ + "." + CustomLocationProvider.__name__ provider = load_location_provider( @@ -65,7 +71,7 @@ def test_custom_location_provider_not_found() -> None: ) -def test_object_storage_injects_entropy() -> None: +def test_object_storage_no_partition() -> None: provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT) location = provider.new_data_location("test.parquet") @@ -82,19 +88,18 @@ def test_object_storage_injects_entropy() -> None: assert all(c in "01" for c in dir_name) -@pytest.mark.parametrize("object_storage", [True, False]) -def test_partition_value_in_path(object_storage: bool) -> None: +def test_object_storage_with_partition() -> None: provider = load_location_provider( table_location="table_location", - table_properties={ - "write.object-storage.enabled": str(object_storage), - }, + table_properties={"write.object-storage.enabled": "true"}, ) location = provider.new_data_location("test.parquet", PARTITION_KEY) - partition_segment = location.split("/")[-2] - assert partition_segment == "string_field=example_string" + # Partition values AND entropy included in the path. Entropy differs to that in the test below because the partition + # key AND the data file name are used as the hash input. This matches Java behaviour; the hash below is what the + # Java implementation produces for this input too. 
+ assert location == "table_location/data/0001/0010/1001/00000011/string_field=example_string/test.parquet" # NB: We test here with None partition key too because disabling partitioned paths still replaces final / with - even in From 61b3510ded32270418ad54f5204113000d3dd07f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Jan 2025 09:03:43 +0100 Subject: [PATCH 29/32] Build: Bump mkdocs-autorefs from 1.2.0 to 1.3.0 (#1513) Bumps [mkdocs-autorefs](https://github.com/mkdocstrings/autorefs) from 1.2.0 to 1.3.0. - [Release notes](https://github.com/mkdocstrings/autorefs/releases) - [Changelog](https://github.com/mkdocstrings/autorefs/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/autorefs/compare/1.2.0...1.3.0) --- updated-dependencies: - dependency-name: mkdocs-autorefs dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 58e36274bf..b67371ecbd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2327,13 +2327,13 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp [[package]] name = "mkdocs-autorefs" -version = "1.2.0" +version = "1.3.0" description = "Automatically link across pages in MkDocs." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"}, - {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"}, + {file = "mkdocs_autorefs-1.3.0-py3-none-any.whl", hash = "sha256:d180f9778a04e78b7134e31418f238bba56f56d6a8af97873946ff661befffb3"}, + {file = "mkdocs_autorefs-1.3.0.tar.gz", hash = "sha256:6867764c099ace9025d6ac24fd07b85a98335fbd30107ef01053697c8f46db61"}, ] [package.dependencies] @@ -5357,4 +5357,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.9, !=3.9.7" -content-hash = "6879624132285053b73c134d72db38b6dace947c67788387a2042d6c78569970" +content-hash = "306213628bcc69346e14742843c8e6bccf19c2615886943c2e1482a954a388ec" diff --git a/pyproject.toml b/pyproject.toml index db84bd27f4..4b425141b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ jinja2 = "3.1.5" mkdocstrings = "0.27.0" mkdocstrings-python = "1.13.0" mkdocs-literate-nav = "0.6.1" -mkdocs-autorefs = "1.2.0" +mkdocs-autorefs = "1.3.0" mkdocs-gen-files = "0.5.0" mkdocs-material = "9.5.49" mkdocs-material-extensions = "1.3.1" From 4e755996c11e1768a63d3f3f663bfa77994648b7 Mon Sep 17 00:00:00 2001 From: hgollakota <43627229+hgollakota@users.noreply.github.com> Date: Wed, 15 Jan 2025 11:21:24 -0500 Subject: [PATCH 30/32] Add support for lowercase `FileFormat`(#1362) * Added support for lowercase FileFormat Modified the FileFormat class so that it utilizes EnumMeta value aliases. This allows both "AVRO" and "avro" to map to AVRO. 
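
For readers who want the alias mechanism in isolation, here is a minimal, self-contained sketch of the EnumMeta value-alias pattern this message describes. The `Format` class and its members are illustrative stand-ins, not PyIceberg code; the real change to `FileFormat` in `pyiceberg/manifest.py` appears in the diff below (and is reverted again in the following patch).

```python
from enum import Enum


class Format(str, Enum):
    """Toy enum whose members also resolve from lowercase aliases."""

    AVRO = "AVRO", "avro"
    PARQUET = "PARQUET", "parquet"
    ORC = "ORC", "orc"

    def __new__(cls, value: str, *aliases: str) -> "Format":
        obj = str.__new__(cls, value)
        obj._value_ = value
        # Register each alias so lookups like Format("avro") resolve to Format.AVRO.
        for alias in aliases:
            cls._value2member_map_[alias] = obj
        return obj


# Both the canonical value and its lowercase alias map to the same member.
assert Format("avro") is Format.AVRO
assert Format("PARQUET") is Format.PARQUET
```
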
* Make mypy happy --------- Co-authored-by: Fokko Driesprong --- pyiceberg/manifest.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 5a32a6330c..598d88cdd8 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -94,9 +94,16 @@ def __repr__(self) -> str: class FileFormat(str, Enum): - AVRO = "AVRO" - PARQUET = "PARQUET" - ORC = "ORC" + AVRO = "AVRO", "avro" + PARQUET = "PARQUET", "parquet" + ORC = "ORC", "orc" + + def __new__(cls, value: str, *value_aliases: List[str]) -> "FileFormat": + obj = str.__new__(cls) + obj._value_ = value + for alias in value_aliases: + cls._value2member_map_[alias] = obj + return obj @classmethod def _missing_(cls, value: object) -> Union[None, str]: From 46253f353a57cb8547ef53a7d17a0161341636c0 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 15 Jan 2025 21:19:00 +0100 Subject: [PATCH 31/32] Revert "Add support for lowercase `FileFormat`(#1362)" (#1518) This reverts commit 4e755996c11e1768a63d3f3f663bfa77994648b7. --- pyiceberg/manifest.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 598d88cdd8..5a32a6330c 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -94,16 +94,9 @@ def __repr__(self) -> str: class FileFormat(str, Enum): - AVRO = "AVRO", "avro" - PARQUET = "PARQUET", "parquet" - ORC = "ORC", "orc" - - def __new__(cls, value: str, *value_aliases: List[str]) -> "FileFormat": - obj = str.__new__(cls) - obj._value_ = value - for alias in value_aliases: - cls._value2member_map_[alias] = obj - return obj + AVRO = "AVRO" + PARQUET = "PARQUET" + ORC = "ORC" @classmethod def _missing_(cls, value: object) -> Union[None, str]: From b806cfa34dbeca89939e20e2f8f1ef467a6381e2 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 15 Jan 2025 21:32:27 +0100 Subject: [PATCH 32/32] IO: Remove deprecations (#1519) --- pyiceberg/io/__init__.py | 9 ------ pyiceberg/io/fsspec.py | 68 +++++----------------------------------- pyiceberg/io/pyarrow.py | 10 +----- 3 files changed, 9 insertions(+), 78 deletions(-) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 40186069d4..f322221e4b 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -48,14 +48,6 @@ logger = logging.getLogger(__name__) -ADLFS_CONNECTION_STRING = "adlfs.connection-string" -ADLFS_ACCOUNT_NAME = "adlfs.account-name" -ADLFS_ACCOUNT_KEY = "adlfs.account-key" -ADLFS_SAS_TOKEN = "adlfs.sas-token" -ADLFS_TENANT_ID = "adlfs.tenant-id" -ADLFS_CLIENT_ID = "adlfs.client-id" -ADLFS_ClIENT_SECRET = "adlfs.client-secret" -ADLFS_PREFIX = "adlfs" AWS_REGION = "client.region" AWS_ACCESS_KEY_ID = "client.access-key-id" AWS_SECRET_ACCESS_KEY = "client.secret-access-key" @@ -94,7 +86,6 @@ GCS_CACHE_TIMEOUT = "gcs.cache-timeout" GCS_REQUESTER_PAYS = "gcs.requester-pays" GCS_SESSION_KWARGS = "gcs.session-kwargs" -GCS_ENDPOINT = "gcs.endpoint" GCS_SERVICE_HOST = "gcs.service.host" GCS_DEFAULT_LOCATION = "gcs.default-bucket-location" GCS_VERSION_AWARE = "gcs.version-aware" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 23796d4e6a..62e9b92342 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -40,13 +40,6 @@ from pyiceberg.catalog import TOKEN from pyiceberg.exceptions import SignError from pyiceberg.io import ( - ADLFS_ACCOUNT_KEY, - ADLFS_ACCOUNT_NAME, - ADLFS_CLIENT_ID, - ADLFS_CONNECTION_STRING, - ADLFS_PREFIX, - ADLFS_SAS_TOKEN, - ADLFS_TENANT_ID, 
ADLS_ACCOUNT_KEY, ADLS_ACCOUNT_NAME, ADLS_CLIENT_ID, @@ -61,7 +54,6 @@ GCS_CACHE_TIMEOUT, GCS_CONSISTENCY, GCS_DEFAULT_LOCATION, - GCS_ENDPOINT, GCS_PROJECT_ID, GCS_REQUESTER_PAYS, GCS_SERVICE_HOST, @@ -78,7 +70,6 @@ S3_SIGNER_ENDPOINT, S3_SIGNER_ENDPOINT_DEFAULT, S3_SIGNER_URI, - ADLFS_ClIENT_SECRET, ADLS_ClIENT_SECRET, FileIO, InputFile, @@ -87,7 +78,6 @@ OutputStream, ) from pyiceberg.typedef import Properties -from pyiceberg.utils.deprecated import deprecation_message from pyiceberg.utils.properties import get_first_property_value, property_as_bool logger = logging.getLogger(__name__) @@ -172,12 +162,6 @@ def _gs(properties: Properties) -> AbstractFileSystem: # https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem from gcsfs import GCSFileSystem - if properties.get(GCS_ENDPOINT): - deprecation_message( - deprecated_in="0.8.0", - removed_in="0.9.0", - help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - ) return GCSFileSystem( project=properties.get(GCS_PROJECT_ID), access=properties.get(GCS_ACCESS, "full_control"), @@ -186,7 +170,7 @@ def _gs(properties: Properties) -> AbstractFileSystem: cache_timeout=properties.get(GCS_CACHE_TIMEOUT), requester_pays=property_as_bool(properties, GCS_REQUESTER_PAYS, False), session_kwargs=json.loads(properties.get(GCS_SESSION_KWARGS, "{}")), - endpoint_url=get_first_property_value(properties, GCS_SERVICE_HOST, GCS_ENDPOINT), + endpoint_url=properties.get(GCS_SERVICE_HOST), default_location=properties.get(GCS_DEFAULT_LOCATION), version_aware=property_as_bool(properties, GCS_VERSION_AWARE, False), ) @@ -195,50 +179,14 @@ def _gs(properties: Properties) -> AbstractFileSystem: def _adls(properties: Properties) -> AbstractFileSystem: from adlfs import AzureBlobFileSystem - for property_name in properties: - if property_name.startswith(ADLFS_PREFIX): - deprecation_message( - deprecated_in="0.8.0", - removed_in="0.9.0", - help_message=f"The property {property_name} is deprecated. 
Please use properties that start with adls.", - ) - return AzureBlobFileSystem( - connection_string=get_first_property_value( - properties, - ADLS_CONNECTION_STRING, - ADLFS_CONNECTION_STRING, - ), - account_name=get_first_property_value( - properties, - ADLS_ACCOUNT_NAME, - ADLFS_ACCOUNT_NAME, - ), - account_key=get_first_property_value( - properties, - ADLS_ACCOUNT_KEY, - ADLFS_ACCOUNT_KEY, - ), - sas_token=get_first_property_value( - properties, - ADLS_SAS_TOKEN, - ADLFS_SAS_TOKEN, - ), - tenant_id=get_first_property_value( - properties, - ADLS_TENANT_ID, - ADLFS_TENANT_ID, - ), - client_id=get_first_property_value( - properties, - ADLS_CLIENT_ID, - ADLFS_CLIENT_ID, - ), - client_secret=get_first_property_value( - properties, - ADLS_ClIENT_SECRET, - ADLFS_ClIENT_SECRET, - ), + connection_string=properties.get(ADLS_CONNECTION_STRING), + account_name=properties.get(ADLS_ACCOUNT_NAME), + account_key=properties.get(ADLS_ACCOUNT_KEY), + sas_token=properties.get(ADLS_SAS_TOKEN), + tenant_id=properties.get(ADLS_TENANT_ID), + client_id=properties.get(ADLS_CLIENT_ID), + client_secret=properties.get(ADLS_ClIENT_SECRET), ) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 1ce0842844..d288e4f2f1 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -90,7 +90,6 @@ AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, GCS_DEFAULT_LOCATION, - GCS_ENDPOINT, GCS_SERVICE_HOST, GCS_TOKEN, GCS_TOKEN_EXPIRES_AT_MS, @@ -166,7 +165,6 @@ from pyiceberg.utils.concurrent import ExecutorFactory from pyiceberg.utils.config import Config from pyiceberg.utils.datetime import millis_to_datetime -from pyiceberg.utils.deprecated import deprecation_message from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int from pyiceberg.utils.singleton import Singleton from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string @@ -471,13 +469,7 @@ def _initialize_gcs_fs(self) -> FileSystem: gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration)) if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION): gcs_kwargs["default_bucket_location"] = bucket_location - if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT): - if self.properties.get(GCS_ENDPOINT): - deprecation_message( - deprecated_in="0.8.0", - removed_in="0.9.0", - help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead", - ) + if endpoint := self.properties.get(GCS_SERVICE_HOST): url_parts = urlparse(endpoint) gcs_kwargs["scheme"] = url_parts.scheme gcs_kwargs["endpoint_override"] = url_parts.netloc