From 493bea27a14b2e1b34543f1094bef4a8dd61d7c9 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Thu, 22 May 2025 16:54:05 +0100 Subject: [PATCH 1/4] Partition statistics metadata reading --- pyiceberg/table/metadata.py | 10 +++++++++- pyiceberg/table/statistics.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index f248700c02..9c2ae29cdd 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -36,7 +36,7 @@ SortOrder, assign_fresh_sort_order_ids, ) -from pyiceberg.table.statistics import StatisticsFile +from pyiceberg.table.statistics import PartitionStatisticsFile, StatisticsFile from pyiceberg.typedef import ( EMPTY_DICT, IcebergBaseModel, @@ -222,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel): table correctly. A table can contain many statistics files associated with different table snapshots.""" + partition_statistics: List[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list) + """An optional list of partition statistics files. + Partition statistics are not required for reading or planning + and readers may ignore them. Each table snapshot may be associated + with at most one partition statistics file. 
A writer can optionally + write the partition statistics file during each write operation, + or it can also be computed on demand.""" + # validators @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: diff --git a/pyiceberg/table/statistics.py b/pyiceberg/table/statistics.py index 151f5e961c..702a262092 100644 --- a/pyiceberg/table/statistics.py +++ b/pyiceberg/table/statistics.py @@ -29,15 +29,24 @@ class BlobMetadata(IcebergBaseModel): properties: Optional[Dict[str, str]] = None -class StatisticsFile(IcebergBaseModel): +class StatisticsCommonFields(IcebergBaseModel): + """Common fields between table and partition statistics structs found on metadata.""" + snapshot_id: int = Field(alias="snapshot-id") statistics_path: str = Field(alias="statistics-path") file_size_in_bytes: int = Field(alias="file-size-in-bytes") + + +class StatisticsFile(StatisticsCommonFields, IcebergBaseModel): file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes") key_metadata: Optional[str] = Field(alias="key-metadata", default=None) blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata") +class PartitionStatisticsFile(IcebergBaseModel): + pass + + def filter_statistics_by_snapshot_id( statistics: List[StatisticsFile], reject_snapshot_id: int, From 591b95499e5bda70cd98cb75bc7589b561352900 Mon Sep 17 00:00:00 2001 From: Fokko Date: Tue, 24 Jun 2025 11:28:24 +0200 Subject: [PATCH 2/4] Add a test --- pyiceberg/table/statistics.py | 4 ++-- tests/table/test_metadata.py | 4 ++-- tests/table/test_statistics.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 tests/table/test_statistics.py diff --git a/pyiceberg/table/statistics.py b/pyiceberg/table/statistics.py index 702a262092..a2e1b149a1 100644 --- a/pyiceberg/table/statistics.py +++ b/pyiceberg/table/statistics.py @@ -37,13 +37,13 @@ class 
StatisticsCommonFields(IcebergBaseModel): file_size_in_bytes: int = Field(alias="file-size-in-bytes") -class StatisticsFile(StatisticsCommonFields, IcebergBaseModel): +class StatisticsFile(StatisticsCommonFields): file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes") key_metadata: Optional[str] = Field(alias="key-metadata", default=None) blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata") -class PartitionStatisticsFile(IcebergBaseModel): +class PartitionStatisticsFile(StatisticsCommonFields): pass diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index a8410cff36..27fd00bd1e 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -173,13 +173,13 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) table_metadata_json = table_metadata.model_dump_json() - expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" + expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"[statistics]":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" assert table_metadata_json == expected def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2).model_dump_json() - expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"format-version":2,"last-sequence-number":34}""" + expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"partition-statistics":[],"format-version":2,"last-sequence-number":34}""" assert table_metadata == expected diff --git a/tests/table/test_statistics.py b/tests/table/test_statistics.py new file mode 
100644 index 0000000000..a7f1b10b8e --- /dev/null +++ b/tests/table/test_statistics.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from pyiceberg.table.statistics import PartitionStatisticsFile + + +def test_partition_statistics_file() -> None: + partition_statistics_file_json = ( + """{"snapshot-id":123,"statistics-path":"s3://bucket/statistics.parquet","file-size-in-bytes":345}""" + ) + partition_statistics_file = PartitionStatisticsFile.model_validate_json(partition_statistics_file_json) + + assert partition_statistics_file == PartitionStatisticsFile( + snapshot_id=123, statistics_path="s3://bucket/statistics.parquet", file_size_in_bytes=345 + ) + + assert partition_statistics_file.model_dump_json() == partition_statistics_file_json From 654abeffd4106168dabad35b14b0659d968ac403 Mon Sep 17 00:00:00 2001 From: Fokko Date: Tue, 24 Jun 2025 15:59:30 +0200 Subject: [PATCH 3/4] Fix tests --- tests/table/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 27fd00bd1e..9141189ec5 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -173,7 +173,7 @@ def 
test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) table_metadata_json = table_metadata.model_dump_json() - expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"[statistics]":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" + expected = 
"""{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" assert table_metadata_json == expected From fe468f753914394b9be3739f64316fcf53d890c0 Mon Sep 17 00:00:00 2001 From: Fokko Date: Tue, 24 Jun 2025 22:28:25 +0200 Subject: [PATCH 4/4] Add test for StatisticsFile --- tests/table/test_statistics.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/table/test_statistics.py b/tests/table/test_statistics.py index a7f1b10b8e..6c91990ea3 100644 --- a/tests/table/test_statistics.py +++ b/tests/table/test_statistics.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from pyiceberg.table.statistics import PartitionStatisticsFile +from pyiceberg.table.statistics import BlobMetadata, PartitionStatisticsFile, StatisticsFile def test_partition_statistics_file() -> None: @@ -28,3 +28,27 @@ def test_partition_statistics_file() -> None: ) assert partition_statistics_file.model_dump_json() == partition_statistics_file_json + + +def test_statistics_file() -> None: + statistics_file_json = """{"snapshot-id":123,"statistics-path":"s3://bucket/statistics.parquet","file-size-in-bytes":345,"file-footer-size-in-bytes":456,"blob-metadata":[{"type":"apache-datasketches-theta-v1","snapshot-id":567,"sequence-number":22,"fields":[1,2,3],"properties":{"foo":"bar"}}]}""" + statistics_file = StatisticsFile.model_validate_json(statistics_file_json) + + assert statistics_file == StatisticsFile( + snapshot_id=123, + statistics_path="s3://bucket/statistics.parquet", + file_size_in_bytes=345, + file_footer_size_in_bytes=456, + key_metadata=None, + blob_metadata=[ + BlobMetadata( + type="apache-datasketches-theta-v1", + snapshot_id=567, + sequence_number=22, + fields=[1, 2, 3], + properties={"foo": "bar"}, + ) + ], + ) + + assert statistics_file.model_dump_json() == statistics_file_json