diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 980399a2ab..78cbe17ace 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -19,7 +19,6 @@ import time from collections import defaultdict from enum import Enum -from functools import lru_cache from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional from pydantic import Field, PrivateAttr, model_serializer @@ -231,13 +230,6 @@ def __eq__(self, other: Any) -> bool: ) -@lru_cache -def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: - """Return the manifests from the manifest list.""" - file = io.new_input(manifest_list) - return list(read_manifest_list(file)) - - class Snapshot(IcebergBaseModel): snapshot_id: int = Field(alias="snapshot-id") parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None) @@ -249,6 +241,9 @@ class Snapshot(IcebergBaseModel): summary: Optional[Summary] = Field(default=None) schema_id: Optional[int] = Field(alias="schema-id", default=None) + # Private attribute for caching the manifests + _manifest_cache: Optional[List[ManifestFile]] = PrivateAttr(default=None) + def __str__(self) -> str: """Return the string representation of the Snapshot class.""" operation = f"{self.summary.operation}: " if self.summary else "" @@ -258,10 +253,14 @@ def __str__(self) -> str: return result_str def manifests(self, io: FileIO) -> List[ManifestFile]: - """Return the manifests for the given snapshot.""" - if self.manifest_list: - return _manifests(io, self.manifest_list) - return [] + """Return the manifests for the given snapshot, using instance-level caching.""" + if self._manifest_cache is None: + if self.manifest_list: + file = io.new_input(self.manifest_list) + self._manifest_cache = list(read_manifest_list(file)) + else: + self._manifest_cache = [] + return self._manifest_cache class MetadataLogEntry(IcebergBaseModel): diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index ef33b16b00..4f72803ffc 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -17,6 +17,7 @@ # pylint: disable=redefined-outer-name,arguments-renamed,fixme from tempfile import TemporaryDirectory from typing import Dict +from unittest.mock import patch import fastavro import pytest @@ -306,6 +307,31 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None: assert entry.status == ManifestEntryStatus.ADDED +def test_read_manifest_cache(generated_manifest_file_file_v2: str) -> None: + with patch("pyiceberg.table.snapshots.read_manifest_list") as mocked_read_manifest_list: + # Mock the read_manifest_list function relative to the module path + io = load_file_io() + + snapshot = Snapshot( + snapshot_id=25, + parent_snapshot_id=19, + timestamp_ms=1602638573590, + manifest_list=generated_manifest_file_file_v2, + summary=Summary(Operation.APPEND), + schema_id=3, + ) + + # Access the manifests property multiple times to test caching + manifests_first_call = snapshot.manifests(io) + manifests_second_call = snapshot.manifests(io) + + # Ensure that read_manifest_list was called only once + mocked_read_manifest_list.assert_called_once() + + # Ensure that the same manifest list is returned + assert manifests_first_call == manifests_second_call + + def test_write_empty_manifest() -> None: io = load_file_io() test_schema = Schema(NestedField(1, "foo", IntegerType(), False))