@@ -262,6 +262,76 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     }


+@pytest.mark.integration
+def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
+    identifier = "default.test_summaries_partial_overwrite"
+    TEST_DATA = {
+        "id": [1, 2, 3, 1, 1],
+        "name": ["AB", "CD", "EF", "CD", "EF"],
+    }
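+    # Identity-partitioning on `id` below yields three partitions: id=1 (3 rows), id=2 (1 row), id=3 (1 row)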
272+ pa_schema = pa .schema (
273+ [
274+ pa .field ("id" , pa .dictionary (pa .int32 (), pa .int32 (), False )),
275+ pa .field ("name" , pa .dictionary (pa .int32 (), pa .string (), False )),
276+ ]
277+ )
278+ arrow_table = pa .Table .from_pydict (TEST_DATA , schema = pa_schema )
279+ tbl = _create_table (session_catalog , identifier , {"format-version" : "2" }, schema = pa_schema )
280+ with tbl .update_spec () as txn :
281+ txn .add_identity ("id" ) # partition by `id` to create 3 data files
282+ tbl .append (arrow_table ) # append
283+ tbl .delete (delete_filter = "id == 1 and name = 'AB'" ) # partial overwrite data from 1 data file
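+    # The delete is copy-on-write: only one row of the id=1 file matches the
+    # filter, so that file should be rewritten with its two surviving rows.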
+
+    rows = spark.sql(
+        f"""
+        SELECT operation, summary
+        FROM {identifier}.snapshots
+        ORDER BY committed_at ASC
+        """
+    ).collect()
+
+    operations = [row.operation for row in rows]
+    assert operations == ["append", "overwrite"]
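+    # The delete commits an "overwrite" snapshot rather than a "delete" snapshot
+    # because a data file only partially matched the filter and had to be rewritten.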
+
+    summaries = [row.summary for row in rows]
+
+    file_size = int(summaries[0]["added-files-size"])
+    assert file_size > 0
+
+    # APPEND
+    assert summaries[0] == {
+        "added-data-files": "3",
+        "added-files-size": "2848",
+        "added-records": "5",
+        "changed-partition-count": "3",
+        "total-data-files": "3",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "2848",
+        "total-position-deletes": "0",
+        "total-records": "5",
+    }
+    # BUG: the `deleted-data-files` property is overwritten with the previous
+    # summary's `total-data-files` value
+    # OVERWRITE from tbl.delete
+    assert summaries[1] == {
+        "added-data-files": "1",
+        "added-files-size": "859",
+        "added-records": "2",  # wrong, should be 0
+        "changed-partition-count": "1",
+        "deleted-data-files": "3",  # wrong, should be 1
+        "deleted-records": "5",  # wrong, should be 1
+        "removed-files-size": "2848",
+        "total-data-files": "1",  # wrong, should be 3
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "859",
+        "total-position-deletes": "0",
+        "total-records": "2",  # wrong, should be 4
+    }
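+    # Final state: the id=2 and id=3 files are untouched and the id=1 file was
+    # rewritten, so 3 data files and 4 rows remain.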
+    assert len(tbl.inspect.data_files()) == 3
+    assert len(tbl.scan().to_pandas()) == 4
+
+
 @pytest.mark.integration
 def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"