From adfc971105adab025118ff8f67eff42f63ea9c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Sun, 3 Nov 2024 22:50:38 -0300 Subject: [PATCH] Add table statistics update --- dev/provision.py | 24 +++++ mkdocs/docs/api.md | 23 +++++ pyiceberg/table/__init__.py | 18 ++++ pyiceberg/table/metadata.py | 9 ++ pyiceberg/table/statistics.py | 49 ++++++++++ pyiceberg/table/update/__init__.py | 36 +++++++ pyiceberg/table/update/statistics.py | 75 ++++++++++++++ tests/conftest.py | 98 +++++++++++++++++++ .../integration/test_statistics_operations.py | 63 ++++++++++++ tests/table/test_init.py | 98 +++++++++++++++++++ tests/table/test_metadata.py | 4 +- 11 files changed, 495 insertions(+), 2 deletions(-) create mode 100644 pyiceberg/table/statistics.py create mode 100644 pyiceberg/table/update/statistics.py create mode 100644 tests/integration/test_statistics_operations.py diff --git a/dev/provision.py b/dev/provision.py index b358da6593..a4dd213e7d 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -401,3 +401,27 @@ ) spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id") spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'") + + spark.sql( + f""" + CREATE OR REPLACE TABLE {catalog_name}.default.test_table_statistics_operations ( + number integer + ) + USING iceberg + TBLPROPERTIES ( + 'format-version'='2' + ); + """ + ) + spark.sql( + f""" + INSERT INTO {catalog_name}.default.test_table_statistics_operations + VALUES (1) + """ + ) + spark.sql( + f""" + INSERT INTO {catalog_name}.default.test_table_statistics_operations + VALUES (2) + """ + ) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 9c48718877..cf989dc285 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1250,6 +1250,29 @@ with table.manage_snapshots() as ms: ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789") ``` +## Table Statistics Management + +Manage table statistics with operations through the `Table` API: + +```python +# To run a specific operation +table.update_statistics().set_statistics(snapshot_id, statistics_file).commit() +# To run multiple operations +table.update_statistics() + .set_statistics(snapshot_id1, statistics_file1) + .remove_statistics(snapshot_id2) + .commit() +# Operations are applied on commit. +``` + +You can also use context managers to make more changes: + +```python +with table.update_statistics() as update: + update.set_statistics(snaphsot_id1, statistics_file) + update.remove_statistics(snapshot_id2) +``` + ## Query the data To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 7bc3fe838b..163447c62a 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -118,6 +118,7 @@ _FastAppendFiles, ) from pyiceberg.table.update.spec import UpdateSpec +from pyiceberg.table.update.statistics import UpdateStatistics from pyiceberg.transforms import IdentityTransform from pyiceberg.typedef import ( EMPTY_DICT, @@ -1035,6 +1036,23 @@ def manage_snapshots(self) -> ManageSnapshots: """ return ManageSnapshots(transaction=Transaction(self, autocommit=True)) + def update_statistics(self) -> UpdateStatistics: + """ + Shorthand to run statistics management operations like add statistics and remove statistics. + + Use table.update_statistics().().commit() to run a specific operation. + Use table.update_statistics().().().commit() to run multiple operations. + + Pending changes are applied on commit. + + We can also use context managers to make more changes. For example: + + with table.update_statistics() as update: + update.set_statistics(snapshot_id=1, statistics_file=statistics_file) + update.remove_statistics(snapshot_id=2) + """ + return UpdateStatistics(transaction=Transaction(self, autocommit=True)) + def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema: """Create a new UpdateSchema to alter the columns of this table. diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 8173bb2c03..ef1a324c45 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -44,6 +44,7 @@ SortOrder, assign_fresh_sort_order_ids, ) +from pyiceberg.table.statistics import StatisticsFile from pyiceberg.typedef import ( EMPTY_DICT, IcebergBaseModel, @@ -221,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel): There is always a main branch reference pointing to the current-snapshot-id even if the refs map is null.""" + statistics: List[StatisticsFile] = Field(default_factory=list) + """A optional list of table statistics files. + Table statistics files are valid Puffin files. Statistics are + informational. A reader can choose to ignore statistics + information. Statistics support is not required to read the + table correctly. A table can contain many statistics files + associated with different table snapshots.""" + # validators @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: diff --git a/pyiceberg/table/statistics.py b/pyiceberg/table/statistics.py new file mode 100644 index 0000000000..a6c73e4aa7 --- /dev/null +++ b/pyiceberg/table/statistics.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import ( + Dict, + List, + Optional, +) + +from pydantic import Field + +from pyiceberg.typedef import IcebergBaseModel + + +class BlobMetadata(IcebergBaseModel): + type: str + snapshot_id: int = Field(alias="snapshot-id") + sequence_number: int = Field(alias="sequence-number") + fields: List[int] + properties: Optional[Dict[str, str]] = None + + +class StatisticsFile(IcebergBaseModel): + snapshot_id: int = Field(alias="snapshot-id") + statistics_path: str = Field(alias="statistics-path") + file_size_in_bytes: int = Field(alias="file-size-in-bytes") + file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes") + key_metadata: Optional[str] = Field(alias="key-metadata", default=None) + blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata") + + +def filter_statistics_by_snapshot_id( + statistics: List[StatisticsFile], + reject_snapshot_id: int, +) -> List[StatisticsFile]: + return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id] diff --git a/pyiceberg/table/update/__init__.py b/pyiceberg/table/update/__init__.py index d5e8c1aba1..3cf2db630d 100644 --- a/pyiceberg/table/update/__init__.py +++ b/pyiceberg/table/update/__init__.py @@ -36,6 +36,7 @@ SnapshotLogEntry, ) from pyiceberg.table.sorting import SortOrder +from pyiceberg.table.statistics import StatisticsFile, filter_statistics_by_snapshot_id from pyiceberg.typedef import ( IcebergBaseModel, Properties, @@ -174,6 +175,17 @@ class RemovePropertiesUpdate(IcebergBaseModel): removals: List[str] +class SetStatisticsUpdate(IcebergBaseModel): + action: Literal["set-statistics"] = Field(default="set-statistics") + snapshot_id: int = Field(alias="snapshot-id") + statistics: StatisticsFile + + +class RemoveStatisticsUpdate(IcebergBaseModel): + action: Literal["remove-statistics"] = Field(default="remove-statistics") + snapshot_id: int = Field(alias="snapshot-id") + + TableUpdate = Annotated[ Union[ AssignUUIDUpdate, @@ -191,6 +203,8 @@ class RemovePropertiesUpdate(IcebergBaseModel): SetLocationUpdate, SetPropertiesUpdate, RemovePropertiesUpdate, + SetStatisticsUpdate, + RemoveStatisticsUpdate, ], Field(discriminator="action"), ] @@ -475,6 +489,28 @@ def _( return base_metadata.model_copy(update={"default_sort_order_id": new_sort_order_id}) +@_apply_table_update.register(SetStatisticsUpdate) +def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: + if update.snapshot_id != update.statistics.snapshot_id: + raise ValueError("Snapshot id in statistics does not match the snapshot id in the update") + + statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id) + context.add_update(update) + + return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]}) + + +@_apply_table_update.register(RemoveStatisticsUpdate) +def _(update: RemoveStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: + if not any(stat.snapshot_id == update.snapshot_id for stat in base_metadata.statistics): + raise ValueError(f"Statistics with snapshot id {update.snapshot_id} does not exist") + + statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id) + context.add_update(update) + + return base_metadata.model_copy(update={"statistics": statistics}) + + def update_table_metadata( base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...], diff --git a/pyiceberg/table/update/statistics.py b/pyiceberg/table/update/statistics.py new file mode 100644 index 0000000000..e31025453b --- /dev/null +++ b/pyiceberg/table/update/statistics.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import TYPE_CHECKING, Tuple + +from pyiceberg.table.statistics import StatisticsFile +from pyiceberg.table.update import ( + RemoveStatisticsUpdate, + SetStatisticsUpdate, + TableUpdate, + UpdatesAndRequirements, + UpdateTableMetadata, +) + +if TYPE_CHECKING: + from pyiceberg.table import Transaction + + +class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]): + """ + Run statistics management operations using APIs. + + APIs include set_statistics and remove statistics operations. + + Use table.update_statistics().().commit() to run a specific operation. + Use table.update_statistics().().().commit() to run multiple operations. + + Pending changes are applied on commit. + + We can also use context managers to make more changes. For example: + + with table.update_statistics() as update: + update.set_statistics(snapshot_id=1, statistics_file=statistics_file) + update.remove_statistics(snapshot_id=2) + """ + + _updates: Tuple[TableUpdate, ...] = () + + def __init__(self, transaction: "Transaction") -> None: + super().__init__(transaction) + + def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics": + self._updates += ( + SetStatisticsUpdate( + snapshot_id=snapshot_id, + statistics=statistics_file, + ), + ) + + return self + + def remove_statistics(self, snapshot_id: int) -> "UpdateStatistics": + self._updates = ( + RemoveStatisticsUpdate( + snapshot_id=snapshot_id, + ), + ) + + return self + + def _commit(self) -> UpdatesAndRequirements: + return self._updates, () diff --git a/tests/conftest.py b/tests/conftest.py index ef980f3818..453c62cbe7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -955,6 +955,87 @@ def generate_snapshot( "refs": {"test": {"snapshot-id": 3051729675574597004, "type": "tag", "max-ref-age-ms": 10000000}}, } +TABLE_METADATA_V2_WITH_STATISTICS = { + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": True, + "type": "long", + } + ], + } + ], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": []}], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [{"order-id": 0, "fields": []}], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1, + }, + ], + "statistics": [ + { + "snapshot-id": 3051729675574597004, + "statistics-path": "s3://a/b/stats.puffin", + "file-size-in-bytes": 413, + "file-footer-size-in-bytes": 42, + "blob-metadata": [ + { + "type": "ndv", + "snapshot-id": 3051729675574597004, + "sequence-number": 1, + "fields": [1], + } + ], + }, + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://a/b/stats.puffin", + "file-size-in-bytes": 413, + "file-footer-size-in-bytes": 42, + "blob-metadata": [ + { + "type": "ndv", + "snapshot-id": 3055729675574597004, + "sequence-number": 1, + "fields": [1], + } + ], + }, + ], + "snapshot-log": [], + "metadata-log": [], +} + @pytest.fixture def example_table_metadata_v2() -> Dict[str, Any]: @@ -966,6 +1047,11 @@ def table_metadata_v2_with_fixed_and_decimal_types() -> Dict[str, Any]: return TABLE_METADATA_V2_WITH_FIXED_AND_DECIMAL_TYPES +@pytest.fixture +def table_metadata_v2_with_statistics() -> Dict[str, Any]: + return TABLE_METADATA_V2_WITH_STATISTICS + + @pytest.fixture(scope="session") def metadata_location(tmp_path_factory: pytest.TempPathFactory) -> str: from pyiceberg.io.pyarrow import PyArrowFileIO @@ -2199,6 +2285,18 @@ def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_s ) +@pytest.fixture +def table_v2_with_statistics(table_metadata_v2_with_statistics: Dict[str, Any]) -> Table: + table_metadata = TableMetadataV2(**table_metadata_v2_with_statistics) + return Table( + identifier=("database", "table"), + metadata=table_metadata, + metadata_location=f"{table_metadata.location}/uuid.metadata.json", + io=load_file_io(), + catalog=NoopCatalog("NoopCatalog"), + ) + + @pytest.fixture def bound_reference_str() -> BoundReference[str]: return BoundReference(field=NestedField(1, "field", StringType(), required=False), accessor=Accessor(position=0, inner=None)) diff --git a/tests/integration/test_statistics_operations.py b/tests/integration/test_statistics_operations.py new file mode 100644 index 0000000000..de9a8e691f --- /dev/null +++ b/tests/integration/test_statistics_operations.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +from pyiceberg.catalog import Catalog +from pyiceberg.table.statistics import BlobMetadata, StatisticsFile + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_manage_statistics(catalog: Catalog) -> None: + identifier = "default.test_table_statistics_operations" + tbl = catalog.load_table(identifier) + + add_snapshot_id_1 = tbl.history()[0].snapshot_id + add_snapshot_id_2 = tbl.history()[1].snapshot_id + + def create_statistics_file(snapshot_id: int) -> StatisticsFile: + blob_metadata = BlobMetadata( + type="boring-type", + snapshot_id=snapshot_id, + sequence_number=2, + fields=[1], + properties={"prop-key": "prop-value"}, + ) + + statistics_file = StatisticsFile( + snapshot_id=snapshot_id, + statistics_path="s3://bucket/warehouse/stats.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[blob_metadata], + ) + + return statistics_file + + statistics_file_snap_1 = create_statistics_file(add_snapshot_id_1) + statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2) + + with tbl.update_statistics() as update: + update.set_statistics(add_snapshot_id_1, statistics_file_snap_1) + update.set_statistics(add_snapshot_id_2, statistics_file_snap_2) + + assert len(tbl.metadata.statistics) == 2 + + with tbl.update_statistics() as update: + update.remove_statistics(add_snapshot_id_1) + + assert len(tbl.metadata.statistics) == 1 diff --git a/tests/table/test_init.py b/tests/table/test_init.py index bcb2d643dc..79f97024d0 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint:disable=redefined-outer-name +import json import uuid from copy import copy from typing import Any, Dict @@ -64,6 +65,7 @@ SortField, SortOrder, ) +from pyiceberg.table.statistics import BlobMetadata, StatisticsFile from pyiceberg.table.update import ( AddSnapshotUpdate, AddSortOrderUpdate, @@ -76,9 +78,11 @@ AssertRefSnapshotId, AssertTableUUID, RemovePropertiesUpdate, + RemoveStatisticsUpdate, SetDefaultSortOrderUpdate, SetPropertiesUpdate, SetSnapshotRefUpdate, + SetStatisticsUpdate, _apply_table_update, _TableMetadataUpdateContext, update_table_metadata, @@ -1247,3 +1251,97 @@ def test_update_metadata_log_overflow(table_v2: Table) -> None: table_v2.metadata_location, ) assert len(new_metadata.metadata_log) == 1 + + +def test_set_statistics_update(table_v2_with_statistics: Table) -> None: + snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id + + blob_metadata = BlobMetadata( + type="boring-type", + snapshot_id=snapshot_id, + sequence_number=2, + fields=[1], + properties={"prop-key": "prop-value"}, + ) + + statistics_file = StatisticsFile( + snapshot_id=snapshot_id, + statistics_path="s3://bucket/warehouse/stats.puffin", + file_size_in_bytes=124, + file_footer_size_in_bytes=27, + blob_metadata=[blob_metadata], + ) + + update = SetStatisticsUpdate( + snapshot_id=snapshot_id, + statistics=statistics_file, + ) + + new_metadata = update_table_metadata( + table_v2_with_statistics.metadata, + (update,), + ) + + expected = """ + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://bucket/warehouse/stats.puffin", + "file-size-in-bytes": 124, + "file-footer-size-in-bytes": 27, + "blob-metadata": [ + { + "type": "boring-type", + "snapshot-id": 3055729675574597004, + "sequence-number": 2, + "fields": [ + 1 + ], + "properties": { + "prop-key": "prop-value" + } + } + ] + }""" + + assert len(new_metadata.statistics) == 2 + + updated_statistics = [stat for stat in new_metadata.statistics if stat.snapshot_id == snapshot_id] + + assert len(updated_statistics) == 1 + assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected) + + update = SetStatisticsUpdate( + snapshot_id=123456789, + statistics=statistics_file, + ) + + with pytest.raises( + ValueError, + match="Snapshot id in statistics does not match the snapshot id in the update", + ): + update_table_metadata( + table_v2_with_statistics.metadata, + (update,), + ) + + +def test_remove_statistics_update(table_v2_with_statistics: Table) -> None: + update = RemoveStatisticsUpdate( + snapshot_id=3055729675574597004, + ) + + remove_metadata = update_table_metadata( + table_v2_with_statistics.metadata, + (update,), + ) + + assert len(remove_metadata.statistics) == 1 + + with pytest.raises( + ValueError, + match="Statistics with snapshot id 123456789 does not exist", + ): + update_table_metadata( + table_v2_with_statistics.metadata, + (RemoveStatisticsUpdate(snapshot_id=123456789),), + ) diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 3b7ccf7c10..6423531304 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -168,13 +168,13 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) table_metadata_json = table_metadata.model_dump_json() - expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" + expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" assert table_metadata_json == expected def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2).model_dump_json() - expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"format-version":2,"last-sequence-number":34}""" + expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"format-version":2,"last-sequence-number":34}""" assert table_metadata == expected