Skip to content

Commit bc7e1cc

Browse files
Fokko authored and kevinjqliu committed
Fix the snapshot summary of a partial overwrite (#1879)
# Rationale for this change @kevinjqliu PTAL. I took the liberty of providing a fix for this since I was curious where this was coming from, hope you don't mind! I've cherry-picked your commit with the test. ![image](https://github.com/user-attachments/assets/14227da9-1f4a-4411-88f0-309907d3d332) Java produces: ```json { "added-data-files": "1", "added-files-size": "707", "added-records": "2", "app-id": "local-1743678304626", "changed-partition-count": "1", "deleted-data-files": "1", "deleted-records": "3", "engine-name": "spark", "engine-version": "3.5.5", "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)", "removed-files-size": "693", "spark.app.id": "local-1743678304626", "total-data-files": "3", "total-delete-files": "0", "total-equality-deletes": "0", "total-files-size": "1993", "total-position-deletes": "0", "total-records": "4" } ``` # Are these changes tested? # Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. --> --------- Co-authored-by: Kevin Liu <[email protected]> Co-authored-by: Kevin Liu <[email protected]>
1 parent 29e94e7 commit bc7e1cc

File tree

5 files changed

+115
-27
lines changed

5 files changed

+115
-27
lines changed

pyiceberg/table/snapshots.py

+6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
2929
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
3030
from pyiceberg.schema import Schema
31+
from pyiceberg.utils.deprecated import deprecation_message
3132

3233
if TYPE_CHECKING:
3334
from pyiceberg.table.metadata import TableMetadata
@@ -356,6 +357,11 @@ def update_snapshot_summaries(
356357
raise ValueError(f"Operation not implemented: {summary.operation}")
357358

358359
if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
360+
deprecation_message(
361+
deprecated_in="0.10.0",
362+
removed_in="0.11.0",
363+
help_message="The truncate-full-table shouldn't be used.",
364+
)
359365
summary = _truncate_table_summary(summary, previous_summary)
360366

361367
if not previous_summary:

pyiceberg/table/update/snapshot.py

-1
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
236236
return update_snapshot_summaries(
237237
summary=Summary(operation=self._operation, **ssc.build(), **snapshot_properties),
238238
previous_summary=previous_snapshot.summary if previous_snapshot is not None else None,
239-
truncate_full_table=self._operation == Operation.OVERWRITE,
240239
)
241240

242241
def _commit(self) -> UpdatesAndRequirements:

tests/integration/test_deletes.py

+9-11
Original file line numberDiff line numberDiff line change
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
467467
assert snapshots[2].summary == Summary(
468468
Operation.OVERWRITE,
469469
**{
470-
"added-files-size": snapshots[2].summary["total-files-size"],
471470
"added-data-files": "1",
471+
"added-files-size": snapshots[2].summary["added-files-size"],
472472
"added-records": "2",
473473
"changed-partition-count": "1",
474-
"total-files-size": snapshots[2].summary["total-files-size"],
475-
"total-delete-files": "0",
476-
"total-data-files": "1",
477-
"total-position-deletes": "0",
478-
"total-records": "2",
479-
"total-equality-deletes": "0",
480-
"deleted-data-files": "2",
481-
"removed-delete-files": "1",
482-
"deleted-records": "5",
474+
"deleted-data-files": "1",
475+
"deleted-records": "3",
483476
"removed-files-size": snapshots[2].summary["removed-files-size"],
484-
"removed-position-deletes": "1",
477+
"total-data-files": "2",
478+
"total-delete-files": "1",
479+
"total-equality-deletes": "0",
480+
"total-files-size": snapshots[2].summary["total-files-size"],
481+
"total-position-deletes": "1",
482+
"total-records": "4",
485483
},
486484
)
487485

tests/integration/test_writes/test_writes.py

+94-1
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
249249
"total-records": "0",
250250
}
251251

252-
# Overwrite
252+
# Append
253253
assert summaries[3] == {
254254
"added-data-files": "1",
255255
"added-files-size": str(file_size),
@@ -263,6 +263,99 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
263263
}
264264

265265

266+
@pytest.mark.integration
267+
def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
268+
identifier = "default.test_summaries_partial_overwrite"
269+
TEST_DATA = {
270+
"id": [1, 2, 3, 1, 1],
271+
"name": ["AB", "CD", "EF", "CD", "EF"],
272+
}
273+
pa_schema = pa.schema(
274+
[
275+
pa.field("id", pa.int32()),
276+
pa.field("name", pa.string()),
277+
]
278+
)
279+
arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
280+
tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
281+
with tbl.update_spec() as txn:
282+
txn.add_identity("id")
283+
tbl.append(arrow_table)
284+
285+
assert len(tbl.inspect.data_files()) == 3
286+
287+
tbl.delete(delete_filter="id == 1 and name = 'AB'") # partial overwrite data from 1 data file
288+
289+
rows = spark.sql(
290+
f"""
291+
SELECT operation, summary
292+
FROM {identifier}.snapshots
293+
ORDER BY committed_at ASC
294+
"""
295+
).collect()
296+
297+
operations = [row.operation for row in rows]
298+
assert operations == ["append", "overwrite"]
299+
300+
summaries = [row.summary for row in rows]
301+
302+
file_size = int(summaries[0]["added-files-size"])
303+
assert file_size > 0
304+
305+
# APPEND
306+
assert summaries[0] == {
307+
"added-data-files": "3",
308+
"added-files-size": "2570",
309+
"added-records": "5",
310+
"changed-partition-count": "3",
311+
"total-data-files": "3",
312+
"total-delete-files": "0",
313+
"total-equality-deletes": "0",
314+
"total-files-size": "2570",
315+
"total-position-deletes": "0",
316+
"total-records": "5",
317+
}
318+
# Java produces:
319+
# {
320+
# "added-data-files": "1",
321+
# "added-files-size": "707",
322+
# "added-records": "2",
323+
# "app-id": "local-1743678304626",
324+
# "changed-partition-count": "1",
325+
# "deleted-data-files": "1",
326+
# "deleted-records": "3",
327+
# "engine-name": "spark",
328+
# "engine-version": "3.5.5",
329+
# "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)",
330+
# "removed-files-size": "693",
331+
# "spark.app.id": "local-1743678304626",
332+
# "total-data-files": "3",
333+
# "total-delete-files": "0",
334+
# "total-equality-deletes": "0",
335+
# "total-files-size": "1993",
336+
# "total-position-deletes": "0",
337+
# "total-records": "4"
338+
# }
339+
files = tbl.inspect.data_files()
340+
assert len(files) == 3
341+
assert summaries[1] == {
342+
"added-data-files": "1",
343+
"added-files-size": "859",
344+
"added-records": "2",
345+
"changed-partition-count": "1",
346+
"deleted-data-files": "1",
347+
"deleted-records": "3",
348+
"removed-files-size": "866",
349+
"total-data-files": "3",
350+
"total-delete-files": "0",
351+
"total-equality-deletes": "0",
352+
"total-files-size": "2563",
353+
"total-position-deletes": "0",
354+
"total-records": "4",
355+
}
356+
assert len(tbl.scan().to_pandas()) == 4
357+
358+
266359
@pytest.mark.integration
267360
def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
268361
identifier = "default.arrow_data_files"

tests/table/test_snapshots.py

+6-14
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,6 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:
289289
"total-position-deletes": "1",
290290
"total-records": "1",
291291
},
292-
truncate_full_table=True,
293292
)
294293

295294
expected = {
@@ -299,18 +298,12 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:
299298
"added-files-size": "4",
300299
"added-position-deletes": "5",
301300
"added-records": "6",
302-
"total-data-files": "1",
303-
"total-records": "6",
304-
"total-delete-files": "2",
305-
"total-equality-deletes": "3",
306-
"total-files-size": "4",
307-
"total-position-deletes": "5",
308-
"deleted-data-files": "1",
309-
"removed-delete-files": "1",
310-
"deleted-records": "1",
311-
"removed-files-size": "1",
312-
"removed-position-deletes": "1",
313-
"removed-equality-deletes": "1",
301+
"total-data-files": "2",
302+
"total-delete-files": "3",
303+
"total-records": "7",
304+
"total-files-size": "5",
305+
"total-position-deletes": "6",
306+
"total-equality-deletes": "4",
314307
}
315308

316309
assert actual.additional_properties == expected
@@ -337,7 +330,6 @@ def test_invalid_type() -> None:
337330
},
338331
),
339332
previous_summary={"total-data-files": "abc"}, # should be a number
340-
truncate_full_table=True,
341333
)
342334

343335
assert "Could not parse summary property total-data-files to an int: abc" in str(e.value)

0 commit comments

Comments (0)