Skip to content

Commit bf63c03

Browse files
committed
fix snapshot inheritance
1 parent cbb8cec commit bf63c03

File tree

2 files changed

+83
-24
lines changed

2 files changed

+83
-24
lines changed

pyiceberg/manifest.py

+67
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,46 @@ class ManifestEntry(Record):
402402
def __init__(self, *data: Any, **named_data: Any) -> None:
403403
super().__init__(*data, **{"struct": MANIFEST_ENTRY_SCHEMAS_STRUCT[DEFAULT_READ_VERSION], **named_data})
404404

405+
def _wrap(
406+
self,
407+
new_status: ManifestEntryStatus,
408+
new_snapshot_id: int,
409+
new_data_sequence_number: Optional[int],
410+
new_file_sequence_number: Optional[int],
411+
new_file: DataFile,
412+
) -> ManifestEntry:
413+
self.status = new_status
414+
self.snapshot_id = new_snapshot_id
415+
self.data_sequence_number = new_data_sequence_number
416+
self.file_sequence_number = new_file_sequence_number
417+
self.data_file = new_file
418+
return self
419+
420+
def _wrap_append(self, new_snapshot_id: int, new_data_sequence_number: Optional[int], new_file: DataFile) -> ManifestEntry:
    """Re-populate this entry in place as an ADDED entry and return it.

    The file sequence number is always reset to ``None`` for appended entries.
    """
    return self._wrap(
        new_status=ManifestEntryStatus.ADDED,
        new_snapshot_id=new_snapshot_id,
        new_data_sequence_number=new_data_sequence_number,
        new_file_sequence_number=None,
        new_file=new_file,
    )
422+
423+
def _wrap_delete(
    self,
    new_snapshot_id: int,
    new_data_sequence_number: Optional[int],
    new_file_sequence_number: Optional[int],
    new_file: DataFile,
) -> ManifestEntry:
    """Re-populate this entry in place as a DELETED entry and return it.

    All four arguments are forwarded unchanged to ``_wrap``; only the status
    is fixed by this helper.
    """
    return self._wrap(
        new_status=ManifestEntryStatus.DELETED,
        new_snapshot_id=new_snapshot_id,
        new_data_sequence_number=new_data_sequence_number,
        new_file_sequence_number=new_file_sequence_number,
        new_file=new_file,
    )
433+
434+
def _wrap_existing(
    self,
    new_snapshot_id: int,
    new_data_sequence_number: Optional[int],
    new_file_sequence_number: Optional[int],
    new_file: DataFile,
) -> ManifestEntry:
    """Re-populate this entry in place as an EXISTING entry and return it.

    All four arguments are forwarded unchanged to ``_wrap``; only the status
    is fixed by this helper.
    """
    return self._wrap(
        new_status=ManifestEntryStatus.EXISTING,
        new_snapshot_id=new_snapshot_id,
        new_data_sequence_number=new_data_sequence_number,
        new_file_sequence_number=new_file_sequence_number,
        new_file=new_file,
    )
444+
405445

406446
PARTITION_FIELD_SUMMARY_TYPE = StructType(
407447
NestedField(509, "contains_null", BooleanType(), required=True),
@@ -654,6 +694,7 @@ class ManifestWriter(ABC):
654694
_deleted_rows: int
655695
_min_data_sequence_number: Optional[int]
656696
_partitions: List[Record]
697+
_reused_entry_wrapper: ManifestEntry
657698

658699
def __init__(
659700
self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT
@@ -673,6 +714,7 @@ def __init__(
673714
self._deleted_rows = 0
674715
self._min_data_sequence_number = None
675716
self._partitions = []
717+
self._reused_entry_wrapper = ManifestEntry()
676718

677719
def __enter__(self) -> ManifestWriter:
678720
"""Open the writer."""
@@ -763,6 +805,31 @@ def add_entry(self, entry: ManifestEntry) -> ManifestWriter:
763805
self._writer.write_block([self.prepare_entry(entry)])
764806
return self
765807

808+
def add(self, entry: ManifestEntry) -> ManifestWriter:
    """Write the data file of `entry` to this manifest as an ADDED entry.

    The entry is re-stamped with this writer's snapshot id. Its data sequence
    number is kept only when it is set and non-negative; otherwise ``None``
    is written in its place.

    Args:
        entry: Entry whose data file (and, when valid, data sequence number)
            is written.

    Returns:
        This writer, so calls can be chained.
    """
    sequence_number = entry.data_sequence_number
    if sequence_number is None or sequence_number < 0:
        # Unset or negative sequence numbers are written out as None.
        sequence_number = None

    wrapped = self._reused_entry_wrapper._wrap_append(self._snapshot_id, sequence_number, entry.data_file)
    self.add_entry(wrapped)
    return self
816+
817+
def delete(self, entry: ManifestEntry) -> ManifestWriter:
    """Write the data file of `entry` to this manifest as a DELETED entry.

    The entry is re-stamped with this writer's snapshot id, while its data and
    file sequence numbers are carried over from `entry` unchanged.

    Args:
        entry: Entry describing the file being deleted.

    Returns:
        This writer, so calls can be chained.
    """
    deleted_entry = self._reused_entry_wrapper._wrap_delete(
        self._snapshot_id,
        entry.data_sequence_number,
        entry.file_sequence_number,
        entry.data_file,
    )
    self.add_entry(deleted_entry)
    return self
824+
825+
def existing(self, entry: ManifestEntry) -> ManifestWriter:
    """Write `entry` to this manifest as an EXISTING entry.

    An existing file was added by an earlier snapshot, so the entry keeps its
    own ``snapshot_id`` rather than being re-stamped with this writer's
    snapshot id. Re-stamping would make a carried-over file look as if it had
    been added by the current snapshot. The data and file sequence numbers
    and the data file are likewise carried over unchanged.

    Args:
        entry: Entry read from a previous manifest that is being retained.

    Returns:
        This writer, so calls can be chained.
    """
    self.add_entry(
        self._reused_entry_wrapper._wrap_existing(
            # Preserve the snapshot that originally added the file.
            entry.snapshot_id,
            entry.data_sequence_number,
            entry.file_sequence_number,
            entry.data_file,
        )
    )
    return self
832+
766833

767834
class ManifestWriterV1(ManifestWriter):
768835
def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int):

pyiceberg/table/__init__.py

+16-24
Original file line numberDiff line numberDiff line change
@@ -430,12 +430,12 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
430430

431431
def merge_append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
432432
"""
433-
Shorthand API for appending a PyArrow table to a table transaction.
433+
Shorthand API for appending a PyArrow table to a table transaction.
434434
435-
Args:
436-
df: The Arrow dataframe that will be appended to overwrite the table
437-
snapshot_properties: Custom properties to be added to the snapshot summary
438-
"""
435+
Args:
436+
df: The Arrow dataframe that will be appended to overwrite the table
437+
snapshot_properties: Custom properties to be added to the snapshot summary
438+
"""
439439
try:
440440
import pyarrow as pa
441441
except ModuleNotFoundError as e:
@@ -461,11 +461,11 @@ def merge_append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY
461461
# skip writing data files if the dataframe is empty
462462
if df.shape[0] > 0:
463463
data_files = _dataframe_to_data_files(
464-
table_metadata=self._table.metadata, write_uuid=update_snapshot.commit_uuid, df=df,
465-
io=self._table.io
464+
table_metadata=self._table.metadata, write_uuid=update_snapshot.commit_uuid, df=df, io=self._table.io
466465
)
467466
for data_file in data_files:
468467
update_snapshot.append_data_file(data_file)
468+
469469
def overwrite(
470470
self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT
471471
) -> None:
@@ -1392,12 +1392,12 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
13921392

13931393
def merge_append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
13941394
"""
1395-
Shorthand API for appending a PyArrow table to the table.
1395+
Shorthand API for appending a PyArrow table to the table.
13961396
1397-
Args:
1398-
df: The Arrow dataframe that will be appended to overwrite the table
1399-
snapshot_properties: Custom properties to be added to the snapshot summary
1400-
"""
1397+
Args:
1398+
df: The Arrow dataframe that will be appended to overwrite the table
1399+
snapshot_properties: Custom properties to be added to the snapshot summary
1400+
"""
14011401
with self.transaction() as tx:
14021402
tx.merge_append(df=df, snapshot_properties=snapshot_properties)
14031403

@@ -3919,26 +3919,18 @@ def _group_by_spec(
39193919
def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> ManifestFile:
39203920
with self._snapshot_producer.new_manifest_writer(spec=self._snapshot_producer.spec(spec_id)) as writer:
39213921
for manifest in manifest_bin:
3922-
for entry in self._snapshot_producer.fetch_manifest_entry(manifest):
3922+
for entry in self._snapshot_producer.fetch_manifest_entry(manifest=manifest, discard_deleted=False):
39233923
if entry.status == ManifestEntryStatus.DELETED:
39243924
# suppress deletes from previous snapshots. only files deleted by this snapshot
39253925
# should be added to the new manifest
39263926
if entry.snapshot_id == self._snapshot_producer.snapshot_id:
3927-
writer.add_entry(entry)
3927+
writer.delete(entry)
39283928
elif entry.status == ManifestEntryStatus.ADDED and entry.snapshot_id == self._snapshot_producer.snapshot_id:
39293929
# adds from this snapshot are still adds, otherwise they should be existing
3930-
writer.add_entry(entry)
3930+
writer.add(entry)
39313931
else:
39323932
# add all files from the old manifest as existing files
3933-
writer.add_entry(
3934-
ManifestEntry(
3935-
status=ManifestEntryStatus.EXISTING,
3936-
snapshot_id=entry.snapshot_id,
3937-
data_sequence_number=entry.data_sequence_number,
3938-
file_sequence_number=entry.file_sequence_number,
3939-
data_file=entry.data_file,
3940-
)
3941-
)
3933+
writer.existing(entry)
39423934

39433935
return writer.to_manifest_file()
39443936

0 commit comments

Comments
 (0)