Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove redundant snapshot_id from SetStatisticsUpdate #1566

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions pyiceberg/table/update/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
from abc import ABC, abstractmethod
from datetime import datetime
from functools import singledispatch
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union, cast

from pydantic import Field, field_validator
from pydantic import Field, field_validator, model_validator
from typing_extensions import Annotated

from pyiceberg.exceptions import CommitFailedException
Expand Down Expand Up @@ -177,8 +177,20 @@ class RemovePropertiesUpdate(IcebergBaseModel):

class SetStatisticsUpdate(IcebergBaseModel):
action: Literal["set-statistics"] = Field(default="set-statistics")
snapshot_id: int = Field(alias="snapshot-id")
statistics: StatisticsFile
snapshot_id: Optional[int] = Field(
None,
alias="snapshot-id",
description="This optional field is **DEPRECATED for REMOVAL** since it contains redundant information. Clients should use the `statistics.snapshot-id` field instead.",
)

@model_validator(mode="before")
def validate_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]:
stats = cast(StatisticsFile, data["statistics"])

data["snapshot_id"] = stats.snapshot_id

return data


class RemoveStatisticsUpdate(IcebergBaseModel):
Expand Down Expand Up @@ -491,10 +503,7 @@ def _(

@_apply_table_update.register(SetStatisticsUpdate)
def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
if update.snapshot_id != update.statistics.snapshot_id:
raise ValueError("Snapshot id in statistics does not match the snapshot id in the update")

statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id)
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.statistics.snapshot_id)
context.add_update(update)

return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]})
Expand Down
3 changes: 1 addition & 2 deletions pyiceberg/table/update/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@ class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
def __init__(self, transaction: "Transaction") -> None:
super().__init__(transaction)

def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics":
def set_statistics(self, statistics_file: StatisticsFile) -> "UpdateStatistics":
self._updates += (
SetStatisticsUpdate(
snapshot_id=snapshot_id,
statistics=statistics_file,
),
)
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_statistics_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ def create_statistics_file(snapshot_id: int, type_name: str) -> StatisticsFile:
statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2, "deletion-vector-v1")

with tbl.update_statistics() as update:
update.set_statistics(add_snapshot_id_1, statistics_file_snap_1)
update.set_statistics(add_snapshot_id_2, statistics_file_snap_2)
update.set_statistics(statistics_file_snap_1)
update.set_statistics(statistics_file_snap_2)

assert len(tbl.metadata.statistics) == 2

Expand Down
14 changes: 0 additions & 14 deletions tests/table/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,20 +1310,6 @@ def test_set_statistics_update(table_v2_with_statistics: Table) -> None:
assert len(updated_statistics) == 1
assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected)

update = SetStatisticsUpdate(
snapshot_id=123456789,
statistics=statistics_file,
)

with pytest.raises(
ValueError,
match="Snapshot id in statistics does not match the snapshot id in the update",
):
update_table_metadata(
table_v2_with_statistics.metadata,
(update,),
)


def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
update = RemoveStatisticsUpdate(
Expand Down