Skip to content

Commit 09193eb

Browse files
committed
adding add_files_overwrite method
Use delete instead of overwrite; check history too.
1 parent 4cd67ac commit 09193eb

File tree

3 files changed

+449
-13
lines changed

3 files changed

+449
-13
lines changed

mkdocs/docs/api.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,12 @@ file_paths = [
672672
673673
tbl.add_files(file_paths=file_paths)
674674
675+
# Or, to replace the table's existing data with these files instead of appending to it:
676+
677+
tbl.add_files_overwrite(file_paths=file_paths)
678+
675679
# A new snapshot is committed to the table with manifests pointing to the existing parquet files
680+
676681
```
677682

678683
<!-- prettier-ignore-start -->

pyiceberg/table/__init__.py

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,27 @@ def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] =
569569
for data_file in data_files:
570570
update_snapshot.append_data_file(data_file)
571571

572+
def add_files_overwrite(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
    """
    Shorthand API for replacing the table's contents with existing files.

    Stages a delete of every row in the table (``ALWAYS_TRUE`` filter) on this
    transaction, followed by a fast-append of the given files as data files.
    The same ``snapshot_properties`` are passed to both the delete and the
    append.

    Args:
        file_paths: The list of full file paths to be added as data files to the table
        snapshot_properties: Custom properties to be added to the snapshot summary

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    if self._table.name_mapping() is None:
        self.set_properties(**{TableProperties.DEFAULT_NAME_MAPPING: self._table.schema().name_mapping.model_dump_json()})

    # Resolve every path BEFORE staging the destructive delete, so that an
    # invalid input (e.g. a missing file) is reported while the transaction
    # still has no delete-all staged on it. list() forces the helper in case
    # it yields lazily (NOTE(review): helper is defined outside this view).
    data_files = list(
        _parquet_files_to_data_files(
            table_metadata=self._table.metadata, file_paths=file_paths, io=self._table.io
        )
    )

    self.delete(delete_filter=ALWAYS_TRUE, snapshot_properties=snapshot_properties)
    with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot:
        for data_file in data_files:
            update_snapshot.append_data_file(data_file)
592+
572593
def update_spec(self) -> UpdateSpec:
573594
"""Create a new UpdateSpec to update the partitioning of the table.
574595
@@ -1480,6 +1501,20 @@ def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] =
14801501
with self.transaction() as tx:
14811502
tx.add_files(file_paths=file_paths, snapshot_properties=snapshot_properties)
14821503

1504+
def add_files_overwrite(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
    """
    Shorthand API for overwriting the table with the given files as data files.

    Opens a transaction on this table and delegates to the transaction's
    ``add_files_overwrite`` with the same arguments.

    Args:
        file_paths: Full paths of the files to register as the table's data files
        snapshot_properties: Custom properties to be added to the snapshot summary

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    with self.transaction() as txn:
        txn.add_files_overwrite(
            file_paths=file_paths,
            snapshot_properties=snapshot_properties,
        )
1517+
14831518
def update_spec(self, case_sensitive: bool = True) -> UpdateSpec:
14841519
return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive)
14851520

@@ -3273,9 +3308,9 @@ def fast_append(self) -> FastAppendFiles:
32733308
def overwrite(self, commit_uuid: Optional[uuid.UUID] = None) -> OverwriteFiles:
32743309
return OverwriteFiles(
32753310
commit_uuid=commit_uuid,
3276-
operation=Operation.OVERWRITE
3277-
if self._transaction.table_metadata.current_snapshot() is not None
3278-
else Operation.APPEND,
3311+
operation=(
3312+
Operation.OVERWRITE if self._transaction.table_metadata.current_snapshot() is not None else Operation.APPEND
3313+
),
32793314
transaction=self._transaction,
32803315
io=self._io,
32813316
snapshot_properties=self._snapshot_properties,
@@ -3665,12 +3700,16 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
36653700
"null_value_count": null_value_counts.get(field.field_id),
36663701
"nan_value_count": nan_value_counts.get(field.field_id),
36673702
# Makes them readable
3668-
"lower_bound": from_bytes(field.field_type, lower_bound)
3669-
if (lower_bound := lower_bounds.get(field.field_id))
3670-
else None,
3671-
"upper_bound": from_bytes(field.field_type, upper_bound)
3672-
if (upper_bound := upper_bounds.get(field.field_id))
3673-
else None,
3703+
"lower_bound": (
3704+
from_bytes(field.field_type, lower_bound)
3705+
if (lower_bound := lower_bounds.get(field.field_id))
3706+
else None
3707+
),
3708+
"upper_bound": (
3709+
from_bytes(field.field_type, upper_bound)
3710+
if (upper_bound := upper_bounds.get(field.field_id))
3711+
else None
3712+
),
36743713
}
36753714
for field in self.tbl.metadata.schema().fields
36763715
}
@@ -3905,9 +3944,11 @@ def _partition_summaries_to_rows(
39053944
"added_delete_files_count": manifest.added_files_count if is_delete_file else 0,
39063945
"existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0,
39073946
"deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0,
3908-
"partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
3909-
if manifest.partitions
3910-
else [],
3947+
"partition_summaries": (
3948+
_partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
3949+
if manifest.partitions
3950+
else []
3951+
),
39113952
})
39123953

39133954
return pa.Table.from_pylist(

0 commit comments

Comments
 (0)