Skip to content

Commit 173ddb9

Browse files
committed
cache manifests
1 parent 18448fd commit 173ddb9

File tree

7 files changed

+26
-18
lines changed

7 files changed

+26
-18
lines changed

pyiceberg/catalog/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None:
717717
manifest_lists_to_delete = set()
718718
manifests_to_delete: List[ManifestFile] = []
719719
for snapshot in metadata.snapshots:
720-
manifests_to_delete += snapshot.manifests(io)
720+
manifests_to_delete += snapshot.manifests(io, snapshot.manifest_list)
721721
if snapshot.manifest_list is not None:
722722
manifest_lists_to_delete.add(snapshot.manifest_list)
723723

pyiceberg/cli/output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def files(self, table: Table, history: bool) -> None:
144144
manifest_list_str = f": {snapshot.manifest_list}" if snapshot.manifest_list else ""
145145
list_tree = snapshot_tree.add(f"Snapshot {snapshot.snapshot_id}, schema {snapshot.schema_id}{manifest_list_str}")
146146

147-
manifest_list = snapshot.manifests(io)
147+
manifest_list = snapshot.manifests(io, snapshot.manifest_list)
148148
for manifest in manifest_list:
149149
manifest_tree = list_tree.add(f"Manifest: {manifest.manifest_path}")
150150
for manifest_entry in manifest.fetch_manifest_entry(io, discard_deleted=False):

pyiceberg/table/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,7 +1708,7 @@ def plan_files(self) -> Iterable[FileScanTask]:
17081708

17091709
manifests = [
17101710
manifest_file
1711-
for manifest_file in snapshot.manifests(self.io)
1711+
for manifest_file in snapshot.manifests(self.io, snapshot.manifest_list)
17121712
if manifest_evaluators[manifest_file.partition_spec_id](manifest_file)
17131713
]
17141714

@@ -2941,7 +2941,7 @@ def _existing_manifests(self) -> List[ManifestFile]:
29412941
if previous_snapshot is None:
29422942
raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}")
29432943

2944-
for manifest in previous_snapshot.manifests(io=self._io):
2944+
for manifest in previous_snapshot.manifests(io=self._io, manifest_list=previous_snapshot.manifest_list):
29452945
if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id:
29462946
existing_manifests.append(manifest)
29472947

@@ -2992,7 +2992,7 @@ def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
29922992
if entry.data_file.content == DataFileContent.DATA
29932993
]
29942994

2995-
list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io))
2995+
list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io, previous_snapshot.manifest_list))
29962996
return list(chain(*list_of_entries))
29972997
else:
29982998
return []
@@ -3384,7 +3384,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
33843384

33853385
entries = []
33863386
snapshot = self._get_snapshot(snapshot_id)
3387-
for manifest in snapshot.manifests(self.tbl.io):
3387+
for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list):
33883388
for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
33893389
column_sizes = entry.data_file.column_sizes or {}
33903390
value_counts = entry.data_file.value_counts or {}
@@ -3546,7 +3546,7 @@ def update_partitions_map(
35463546

35473547
partitions_map: Dict[Tuple[str, Any], Any] = {}
35483548
snapshot = self._get_snapshot(snapshot_id)
3549-
for manifest in snapshot.manifests(self.tbl.io):
3549+
for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list):
35503550
for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
35513551
partition = entry.data_file.partition
35523552
partition_record_dict = {
@@ -3624,7 +3624,7 @@ def _partition_summaries_to_rows(
36243624
specs = self.tbl.metadata.specs()
36253625
manifests = []
36263626
if snapshot := self.tbl.metadata.current_snapshot():
3627-
for manifest in snapshot.manifests(self.tbl.io):
3627+
for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list):
36283628
is_data_file = manifest.content == ManifestContent.DATA
36293629
is_delete_file = manifest.content == ManifestContent.DELETES
36303630
manifests.append({

pyiceberg/table/snapshots.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import time
2020
from collections import defaultdict
2121
from enum import Enum
22+
from functools import lru_cache
2223
from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional
2324

2425
from pydantic import Field, PrivateAttr, model_serializer
@@ -247,9 +248,12 @@ def __str__(self) -> str:
247248
result_str = f"{operation}id={self.snapshot_id}{parent_id}{schema_id}"
248249
return result_str
249250

250-
def manifests(self, io: FileIO) -> List[ManifestFile]:
251-
if self.manifest_list is not None:
252-
file = io.new_input(self.manifest_list)
251+
@staticmethod
252+
@lru_cache
253+
def manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]:
254+
"""Return the manifests for the given snapshot."""
255+
if manifest_list not in (None, ""):
256+
file = io.new_input(manifest_list)
253257
return list(read_manifest_list(file))
254258
return []
255259

tests/integration/test_partitioning_key.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -763,10 +763,14 @@ def test_partition_key(
763763
snapshot = iceberg_table.current_snapshot()
764764
assert snapshot
765765
spark_partition_for_justification = (
766-
snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.partition
766+
snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0]
767+
.fetch_manifest_entry(iceberg_table.io)[0]
768+
.data_file.partition
767769
)
768770
spark_path_for_justification = (
769-
snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
771+
snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0]
772+
.fetch_manifest_entry(iceberg_table.io)[0]
773+
.data_file.file_path
770774
)
771775
assert spark_partition_for_justification == expected_partition_record
772776
assert expected_hive_partition_path_slice in spark_path_for_justification

tests/integration/test_rest_manifest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None:
7575
if test_snapshot is None:
7676
raise ValueError("Table has no current snapshot, check the docker environment")
7777
io = table_test_all_types.io
78-
test_manifest_file = test_snapshot.manifests(io)[0]
78+
test_manifest_file = test_snapshot.manifests(io, test_snapshot.manifest_list)[0]
7979
test_manifest_entries = test_manifest_file.fetch_manifest_entry(io)
8080
entry = test_manifest_entries[0]
8181
test_schema = table_test_all_types.schema()

tests/utils/test_manifest.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def test_read_manifest_v1(generated_manifest_file_file_v1: str) -> None:
217217
summary=Summary(Operation.APPEND),
218218
schema_id=3,
219219
)
220-
manifest_list = snapshot.manifests(io)[0]
220+
manifest_list = snapshot.manifests(io, snapshot.manifest_list)[0]
221221

222222
assert manifest_list.manifest_length == 7989
223223
assert manifest_list.partition_spec_id == 0
@@ -267,7 +267,7 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None:
267267
summary=Summary(Operation.APPEND),
268268
schema_id=3,
269269
)
270-
manifest_list = snapshot.manifests(io)[0]
270+
manifest_list = snapshot.manifests(io, manifest_list=snapshot.manifest_list)[0]
271271

272272
assert manifest_list.manifest_length == 7989
273273
assert manifest_list.partition_spec_id == 0
@@ -319,7 +319,7 @@ def test_write_manifest(
319319
summary=Summary(Operation.APPEND),
320320
schema_id=3,
321321
)
322-
demo_manifest_file = snapshot.manifests(io)[0]
322+
demo_manifest_file = snapshot.manifests(io, snapshot.manifest_list)[0]
323323
manifest_entries = demo_manifest_file.fetch_manifest_entry(io)
324324
test_schema = Schema(
325325
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False)
@@ -491,7 +491,7 @@ def test_write_manifest_list(
491491
schema_id=3,
492492
)
493493

494-
demo_manifest_list = snapshot.manifests(io)
494+
demo_manifest_list = snapshot.manifests(io, snapshot.manifest_list)
495495
with TemporaryDirectory() as tmp_dir:
496496
path = tmp_dir + "/manifest-list.avro"
497497
output = io.new_output(path)

0 commit comments

Comments
 (0)