From 9960aee1968a17a1be5aee92bd9b71b3eb952c0b Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 18 Sep 2024 22:31:01 -0700 Subject: [PATCH 1/3] cache manifest files in snapshot class --- pyiceberg/table/snapshots.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 980399a2ab..145f94b155 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -19,7 +19,6 @@ import time from collections import defaultdict from enum import Enum -from functools import lru_cache from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional from pydantic import Field, PrivateAttr, model_serializer @@ -231,13 +230,6 @@ def __eq__(self, other: Any) -> bool: ) -@lru_cache -def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: - """Return the manifests from the manifest list.""" - file = io.new_input(manifest_list) - return list(read_manifest_list(file)) - - class Snapshot(IcebergBaseModel): snapshot_id: int = Field(alias="snapshot-id") parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None) @@ -249,6 +241,8 @@ class Snapshot(IcebergBaseModel): summary: Optional[Summary] = Field(default=None) schema_id: Optional[int] = Field(alias="schema-id", default=None) + _manifest_cache: Optional[List[ManifestFile]] = None # Cache for manifests + def __str__(self) -> str: """Return the string representation of the Snapshot class.""" operation = f"{self.summary.operation}: " if self.summary else "" @@ -258,10 +252,14 @@ def __str__(self) -> str: return result_str def manifests(self, io: FileIO) -> List[ManifestFile]: - """Return the manifests for the given snapshot.""" - if self.manifest_list: - return _manifests(io, self.manifest_list) - return [] + """Return the manifests for the given snapshot, using instance-level caching.""" + if self._manifest_cache is None: + if self.manifest_list: + file = io.new_input(self.manifest_list) + self._manifest_cache = list(read_manifest_list(file)) + else: + self._manifest_cache = [] + return self._manifest_cache class MetadataLogEntry(IcebergBaseModel): From 17e69a6d87a8fcd21c89567052dcc9475f59b854 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 18 Sep 2024 22:31:51 -0700 Subject: [PATCH 2/3] use privateattr --- pyiceberg/table/snapshots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 145f94b155..78cbe17ace 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -241,7 +241,8 @@ class Snapshot(IcebergBaseModel): summary: Optional[Summary] = Field(default=None) schema_id: Optional[int] = Field(alias="schema-id", default=None) - _manifest_cache: Optional[List[ManifestFile]] = None # Cache for manifests + # Private attribute for caching the manifests + _manifest_cache: Optional[List[ManifestFile]] = PrivateAttr(default=None) def __str__(self) -> str: """Return the string representation of the Snapshot class.""" From 1f471f84f9280cd8703b7f2ba5f3713b16243fa2 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 19 Sep 2024 10:31:40 -0700 Subject: [PATCH 3/3] add cache test --- tests/utils/test_manifest.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index ef33b16b00..4f72803ffc 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -17,6 +17,7 @@ # pylint: disable=redefined-outer-name,arguments-renamed,fixme from tempfile import TemporaryDirectory from typing import Dict +from unittest.mock import patch import fastavro import pytest @@ -306,6 +307,31 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None: assert entry.status == ManifestEntryStatus.ADDED +def test_read_manifest_cache(generated_manifest_file_file_v2: str) -> None: + with patch("pyiceberg.table.snapshots.read_manifest_list") as mocked_read_manifest_list: + # Mock the read_manifest_list function relative to the module path + io = load_file_io() + + snapshot = Snapshot( + snapshot_id=25, + parent_snapshot_id=19, + timestamp_ms=1602638573590, + manifest_list=generated_manifest_file_file_v2, + summary=Summary(Operation.APPEND), + schema_id=3, + ) + + # Access the manifests property multiple times to test caching + manifests_first_call = snapshot.manifests(io) + manifests_second_call = snapshot.manifests(io) + + # Ensure that read_manifest_list was called only once + mocked_read_manifest_list.assert_called_once() + + # Ensure that the same manifest list is returned + assert manifests_first_call == manifests_second_call + + def test_write_empty_manifest() -> None: io = load_file_io() test_schema = Schema(NestedField(1, "foo", IntegerType(), False))