Skip to content

Commit 6b7f07a

Browse files
authored
Use cachetools's LRUCache to cache manifest list (#1187)
* use cachetools
* use LRU cache
* return tuple
* comment
* clear global cache for tests
* move _manifests to manifest.py
* rebase poetry.lock
1 parent 5dcda55 commit 6b7f07a

File tree

5 files changed

+359
-12
lines changed

5 files changed

+359
-12
lines changed

poetry.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/manifest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
List,
2929
Literal,
3030
Optional,
31+
Tuple,
3132
Type,
3233
)
3334

35+
from cachetools import LRUCache, cached
36+
from cachetools.keys import hashkey
3437
from pydantic_core import to_json
3538

3639
from pyiceberg.avro.file import AvroFile, AvroOutputFile
@@ -620,6 +623,13 @@ def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> List
620623
]
621624

622625

626+
@cached(cache=LRUCache(maxsize=128), key=lambda io, manifest_list: hashkey(manifest_list))
627+
def _manifests(io: FileIO, manifest_list: str) -> Tuple[ManifestFile, ...]:
628+
"""Read and cache manifests from the given manifest list, returning a tuple to prevent modification."""
629+
file = io.new_input(manifest_list)
630+
return tuple(read_manifest_list(file))
631+
632+
623633
def read_manifest_list(input_file: InputFile) -> Iterator[ManifestFile]:
624634
"""
625635
Read the manifests from the manifest list.

pyiceberg/table/snapshots.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,12 @@
1919
import time
2020
from collections import defaultdict
2121
from enum import Enum
22-
from functools import lru_cache
2322
from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional
2423

2524
from pydantic import Field, PrivateAttr, model_serializer
2625

2726
from pyiceberg.io import FileIO
28-
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, read_manifest_list
27+
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
2928
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
3029
from pyiceberg.schema import Schema
3130

@@ -231,13 +230,6 @@ def __eq__(self, other: Any) -> bool:
231230
)
232231

233232

234-
@lru_cache
235-
def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]:
236-
"""Return the manifests from the manifest list."""
237-
file = io.new_input(manifest_list)
238-
return list(read_manifest_list(file))
239-
240-
241233
class Snapshot(IcebergBaseModel):
242234
snapshot_id: int = Field(alias="snapshot-id")
243235
parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None)
@@ -260,7 +252,7 @@ def __str__(self) -> str:
260252
def manifests(self, io: FileIO) -> List[ManifestFile]:
261253
"""Return the manifests for the given snapshot."""
262254
if self.manifest_list:
263-
return _manifests(io, self.manifest_list)
255+
return list(_manifests(io, self.manifest_list))
264256
return []
265257

266258

0 commit comments

Comments
 (0)