Skip to content

Commit abbb01c

Browse files
committed
fixes according to pr
1 parent bf68848 commit abbb01c

File tree

4 files changed

+44
-33
lines changed

4 files changed

+44
-33
lines changed

pyiceberg/manifest.py

-1
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,6 @@ def existing(self, entry: ManifestEntry) -> ManifestWriter:
865865
entry.snapshot_id, entry.sequence_number, entry.file_sequence_number, entry.data_file
866866
)
867867
)
868-
self._existing_files += 1
869868
return self
870869

871870

pyiceberg/table/__init__.py

-3
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,6 @@ class TableProperties:
219219
MIN_SNAPSHOTS_TO_KEEP = "history.expire.min-snapshots-to-keep"
220220
MIN_SNAPSHOTS_TO_KEEP_DEFAULT = 1
221221

222-
SNAPSHOT_ID_INHERITANCE_ENABLED = "compatibility.snapshot-id-inheritance.enabled"
223-
SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT = False
224-
225222

226223
class Transaction:
227224
_table: Table

pyiceberg/table/update/snapshot.py

+39-25
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from abc import abstractmethod
2323
from collections import defaultdict
2424
from concurrent.futures import Future
25-
from dataclasses import dataclass
25+
from dataclasses import dataclass, field
2626
from functools import cached_property
2727
from typing import TYPE_CHECKING, Any, Callable, Dict, Generic, List, Optional, Set, Tuple
2828

@@ -480,8 +480,8 @@ def _deleted_entries(self) -> List[ManifestEntry]:
480480

481481
@dataclass(init=False)
482482
class RewriteManifestsResult:
483-
rewritten_manifests: List[ManifestFile]
484-
added_manifests: List[ManifestFile]
483+
rewritten_manifests: List[ManifestFile] = field(default_factory=list)
484+
added_manifests: List[ManifestFile] = field(default_factory=list)
485485

486486
def __init__(
487487
self,
@@ -544,7 +544,11 @@ def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile
544544

545545

546546
class _RewriteManifests(_SnapshotProducer["_RewriteManifests"]):
547+
_table: Table
548+
_spec_id: int
547549
_target_size_bytes: int
550+
_min_count_to_merge: int
551+
_merge_enabled: bool
548552
rewritten_manifests: List[ManifestFile] = []
549553
added_manifests: List[ManifestFile] = []
550554
kept_manifests: List[ManifestFile] = []
@@ -559,10 +563,15 @@ def __init__(
559563
):
560564
from pyiceberg.table import TableProperties
561565

562-
_table: Table
563-
_spec: PartitionSpec
564-
565566
super().__init__(Operation.REPLACE, transaction, io, snapshot_properties=snapshot_properties)
567+
568+
snapshot = self._table.current_snapshot()
569+
if self._spec_id and self._spec_id not in self._table.specs():
570+
raise ValueError(f"Cannot find spec with id: {self._spec_id}")
571+
572+
if not snapshot:
573+
raise ValueError("Cannot rewrite manifests without a current snapshot")
574+
566575
self._target_size_bytes = property_as_int(
567576
self._transaction.table_metadata.properties,
568577
TableProperties.MANIFEST_TARGET_SIZE_BYTES,
@@ -571,6 +580,17 @@ def __init__(
571580
self._table = table
572581
self._spec_id = spec_id or table.spec().spec_id
573582

583+
self._min_count_to_merge = property_as_int(
584+
self._transaction.table_metadata.properties,
585+
TableProperties.MANIFEST_MIN_MERGE_COUNT,
586+
TableProperties.MANIFEST_MIN_MERGE_COUNT_DEFAULT,
587+
) # type: ignore
588+
self._merge_enabled = property_as_bool(
589+
self._transaction.table_metadata.properties,
590+
TableProperties.MANIFEST_MERGE_ENABLED,
591+
TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT,
592+
)
593+
574594
def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
575595
from pyiceberg.table import TableProperties
576596

@@ -583,10 +603,10 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
583603
ssc.set_partition_summary_limit(partition_summary_limit)
584604

585605
props = {
586-
"manifests-kept": str(len([])),
606+
"manifests-kept": "0",
587607
"manifests-created": str(len(self.added_manifests)),
588608
"manifests-replaced": str(len(self.rewritten_manifests)),
589-
"entries-processed": str(len([])),
609+
"entries-processed": "0",
590610
}
591611
previous_snapshot = (
592612
self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
@@ -601,28 +621,25 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
601621
)
602622

603623
def rewrite_manifests(self) -> RewriteManifestsResult:
604-
data_result = self._find_matching_manifests(ManifestContent.DATA)
624+
snapshot = self._table.current_snapshot()
625+
if not snapshot:
626+
raise ValueError("Cannot rewrite manifests without a current snapshot")
627+
628+
data_result = self._find_matching_manifests(snapshot, ManifestContent.DATA)
605629

606630
self.rewritten_manifests.extend(data_result.rewritten_manifests)
607631
self.added_manifests.extend(data_result.added_manifests)
608632

609-
deletes_result = self._find_matching_manifests(ManifestContent.DELETES)
633+
deletes_result = self._find_matching_manifests(snapshot, ManifestContent.DELETES)
610634
self.rewritten_manifests.extend(deletes_result.rewritten_manifests)
611635
self.added_manifests.extend(deletes_result.added_manifests)
612636

613-
if not self.rewritten_manifests:
637+
if len(self.rewritten_manifests) == 0:
614638
return RewriteManifestsResult(rewritten_manifests=[], added_manifests=[])
615639

616640
return RewriteManifestsResult(rewritten_manifests=self.rewritten_manifests, added_manifests=self.added_manifests)
617641

618-
def _find_matching_manifests(self, content: ManifestContent) -> RewriteManifestsResult:
619-
snapshot = self._table.current_snapshot()
620-
if self._spec_id and self._spec_id not in self._table.specs():
621-
raise ValueError(f"Cannot find spec with id: {self._spec_id}")
622-
623-
if not snapshot:
624-
raise ValueError("Cannot rewrite manifests without a current snapshot")
625-
642+
def _find_matching_manifests(self, snapshot: Snapshot, content: ManifestContent) -> RewriteManifestsResult:
626643
manifests = [
627644
manifest
628645
for manifest in snapshot.manifests(io=self._io)
@@ -631,8 +648,8 @@ def _find_matching_manifests(self, content: ManifestContent) -> RewriteManifests
631648

632649
data_manifest_merge_manager = _ManifestMergeManager(
633650
target_size_bytes=self._target_size_bytes,
634-
min_count_to_merge=2,
635-
merge_enabled=True,
651+
min_count_to_merge=self._min_count_to_merge,
652+
merge_enabled=self._merge_enabled,
636653
snapshot_producer=self,
637654
)
638655
new_manifests = data_manifest_merge_manager.merge_manifests(manifests=manifests)
@@ -668,10 +685,7 @@ def _existing_manifests(self) -> List[ManifestFile]:
668685
return [self._copy_manifest_file(manifest, self.snapshot_id) for manifest in self.added_manifests]
669686

670687
def _deleted_entries(self) -> List[ManifestEntry]:
671-
"""To determine if we need to record any deleted manifest entries.
672-
673-
In case of an append, nothing is deleted.
674-
"""
688+
"""To determine if we need to record any deleted manifest entries."""
675689
return []
676690

677691

tests/integration/test_writes/test_rewrite_manifests.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,9 @@ def compute_manifest_entry_size_bytes(manifests: List[ManifestFile]) -> float:
180180

181181
for manifest in manifests:
182182
total_size += manifest.manifest_length
183-
num_entries += manifest.added_files_count + manifest.existing_files_count + manifest.deleted_files_count
184-
183+
num_entries += (
184+
(manifest.added_files_count or 0) + (manifest.existing_files_count or 0) + (manifest.deleted_files_count or 0)
185+
)
185186
return total_size / num_entries if num_entries > 0 else 0
186187

187188

@@ -212,7 +213,7 @@ def test_rewrite_small_manifests_partitioned_table(session_catalog: Catalog) ->
212213
tbl.refresh()
213214

214215
tbl.refresh()
215-
manifests = tbl.current_snapshot().manifests(tbl.io)
216+
manifests = tbl.current_snapshot().manifests(tbl.io) # type: ignore
216217
assert len(manifests) == 4, "Should have 4 manifests before rewrite"
217218

218219
# manifest_entry_size_bytes = compute_manifest_entry_size_bytes(manifests)
@@ -229,7 +230,7 @@ def test_rewrite_small_manifests_partitioned_table(session_catalog: Catalog) ->
229230
assert len(result.rewritten_manifests) == 4, "Action should rewrite 4 manifests"
230231
assert len(result.added_manifests) == 2, "Action should add 2 manifests"
231232

232-
new_manifests = tbl.current_snapshot().manifests(tbl.io)
233+
new_manifests = tbl.current_snapshot().manifests(tbl.io) # type: ignore
233234
assert len(new_manifests) == 2, "Should have 2 manifests after rewrite"
234235

235236
assert new_manifests[0].existing_files_count == 4

0 commit comments

Comments
 (0)