Skip to content

Commit 2609002

Browse files
committed
fix TODOs
1 parent 0519448 commit 2609002

File tree

1 file changed

+38
-57
lines changed

1 file changed

+38
-57
lines changed

pyiceberg/table/__init__.py

Lines changed: 38 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,7 +2435,7 @@ def _dataframe_to_data_files(
24352435
yield from write_file(table, iter([WriteTask(write_uuid, next(counter), df)]), file_schema=file_schema)
24362436

24372437

2438-
class _SnapshotProducer:
2438+
class _SnapshotProducer(ABC):
24392439
commit_uuid: uuid.UUID
24402440
_operation: Operation
24412441
_table: Table
@@ -2480,10 +2480,9 @@ def _deleted_entries(self) -> List[ManifestEntry]: ...
24802480
@abstractmethod
24812481
def _existing_manifests(self) -> List[ManifestFile]: ...
24822482

2483-
def _combine_manifests(
2484-
self, added_manifests: List[ManifestFile], delete_manifests: List[ManifestFile], existing_manifests: List[ManifestFile]
2485-
) -> List[ManifestFile]:
2486-
return added_manifests + delete_manifests + existing_manifests
2483+
def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]:
2484+
"""To perform any post-processing on the manifests before writing them to the new snapshot."""
2485+
return manifests
24872486

24882487
def _manifests(self) -> List[ManifestFile]:
24892488
def _write_added_manifest() -> List[ManifestFile]:
@@ -2535,7 +2534,7 @@ def _write_delete_manifest() -> List[ManifestFile]:
25352534
delete_manifests = executor.submit(_write_delete_manifest)
25362535
existing_manifests = executor.submit(self._existing_manifests)
25372536

2538-
return self._combine_manifests(added_manifests.result(), delete_manifests.result(), existing_manifests.result())
2537+
return self._process_manifests(added_manifests.result() + delete_manifests.result() + existing_manifests.result())
25392538

25402539
def _summary(self) -> Summary:
25412540
ssc = SnapshotSummaryCollector()
@@ -2619,7 +2618,30 @@ def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = T
26192618
return manifest.fetch_manifest_entry(io=self._table.io, discard_deleted=discard_deleted)
26202619

26212620

2622-
class MergeAppendFiles(_SnapshotProducer):
2621+
class AppendFiles(_SnapshotProducer, ABC):
2622+
def _existing_manifests(self) -> List[ManifestFile]:
2623+
"""To determine if there are any existing manifest files.
2624+
2625+
All the existing manifest files are considered existing.
2626+
A fast append will add another ManifestFile to the ManifestList.
2627+
A merge append will merge ManifestFiles if needed later.
2628+
"""
2629+
existing_manifests = []
2630+
2631+
if self._parent_snapshot_id is not None:
2632+
previous_snapshot = self._table.snapshot_by_id(self._parent_snapshot_id)
2633+
2634+
if previous_snapshot is None:
2635+
raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}")
2636+
2637+
for manifest in previous_snapshot.manifests(io=self._table.io):
2638+
if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id:
2639+
existing_manifests.append(manifest)
2640+
2641+
return existing_manifests
2642+
2643+
2644+
class MergeAppendFiles(AppendFiles):
26232645
_target_size_bytes: int
26242646
_min_count_to_merge: int
26252647
_merge_enabled: bool
@@ -2642,39 +2664,21 @@ def __init__(
26422664
self._table.properties, TableProperties.MANIFEST_MERGE_ENABLED, TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT
26432665
)
26442666

2645-
def _existing_manifests(self) -> List[ManifestFile]:
2646-
"""To determine if there are any existing manifest files."""
2647-
existing_manifests = []
2648-
2649-
if self._parent_snapshot_id is not None:
2650-
previous_snapshot = self._table.snapshot_by_id(self._parent_snapshot_id)
2651-
2652-
if previous_snapshot is None:
2653-
raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}")
2654-
2655-
for manifest in previous_snapshot.manifests(io=self._table.io):
2656-
if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id:
2657-
existing_manifests.append(manifest)
2658-
2659-
return existing_manifests
2660-
26612667
def _deleted_entries(self) -> List[ManifestEntry]:
26622668
"""To determine if we need to record any deleted manifest entries.
26632669
26642670
In case of an append, nothing is deleted.
26652671
"""
26662672
return []
26672673

2668-
def _combine_manifests(
2669-
self, added_manifests: List[ManifestFile], delete_manifests: List[ManifestFile], existing_manifests: List[ManifestFile]
2670-
) -> List[ManifestFile]:
2671-
unmerged_data_manifests = (
2672-
added_manifests
2673-
+ delete_manifests
2674-
+ [manifest for manifest in existing_manifests if manifest.content == ManifestContent.DATA]
2675-
)
2676-
# TODO: need to re-consider the name here: manifest containing positional deletes and manifest containing deleted entries
2677-
unmerged_deletes_manifests = [manifest for manifest in existing_manifests if manifest.content == ManifestContent.DELETES]
2674+
def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]:
2675+
"""To perform any post-processing on the manifests before writing them to the new snapshot.
2676+
2677+
In MergeAppendFiles, we merge manifests based on the target size and the minimum count to merge
2678+
if automatic merge is enabled.
2679+
"""
2680+
unmerged_data_manifests = [manifest for manifest in manifests if manifest.content == ManifestContent.DATA]
2681+
unmerged_deletes_manifests = [manifest for manifest in manifests if manifest.content == ManifestContent.DELETES]
26782682

26792683
data_manifest_merge_manager = _ManifestMergeManager(
26802684
target_size_bytes=self._target_size_bytes,
@@ -2686,27 +2690,7 @@ def _combine_manifests(
26862690
return data_manifest_merge_manager.merge_manifests(unmerged_data_manifests) + unmerged_deletes_manifests
26872691

26882692

2689-
class FastAppendFiles(_SnapshotProducer):
2690-
def _existing_manifests(self) -> List[ManifestFile]:
2691-
"""To determine if there are any existing manifest files.
2692-
2693-
A fast append will add another ManifestFile to the ManifestList.
2694-
All the existing manifest files are considered existing.
2695-
"""
2696-
existing_manifests = []
2697-
2698-
if self._parent_snapshot_id is not None:
2699-
previous_snapshot = self._table.snapshot_by_id(self._parent_snapshot_id)
2700-
2701-
if previous_snapshot is None:
2702-
raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}")
2703-
2704-
for manifest in previous_snapshot.manifests(io=self._table.io):
2705-
if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id:
2706-
existing_manifests.append(manifest)
2707-
2708-
return existing_manifests
2709-
2693+
class FastAppendFiles(AppendFiles):
27102694
def _deleted_entries(self) -> List[ManifestEntry]:
27112695
"""To determine if we need to record any deleted manifest entries.
27122696
@@ -2804,9 +2788,6 @@ def _group_by_spec(
28042788
return groups
28052789

28062790
def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> ManifestFile:
2807-
# TODO: later check if we need to implement cache:
2808-
# https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/ManifestMergeManager.java#L165
2809-
28102791
with self._snapshot_producer.new_manifest_writer(spec=self._snapshot_producer.spec(spec_id)) as writer:
28112792
for manifest in manifest_bin:
28122793
for entry in self._snapshot_producer.fetch_manifest_entry(manifest):

0 commit comments

Comments
 (0)