Skip to content

Commit a403c65

Browse files
authored
Include DELETE entries when inspecting (apache#1731)
While doing some checks, I noticed that DELETE entries were missing from the inspection output.
1 parent e9d35d5 commit a403c65

File tree

2 files changed

+5
-1
lines changed

2 files changed

+5
-1
lines changed

pyiceberg/table/inspect.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -161,7 +161,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
161161
entries = []
162162
snapshot = self._get_snapshot(snapshot_id)
163163
for manifest in snapshot.manifests(self.tbl.io):
164-
for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
164+
for entry in manifest.fetch_manifest_entry(io=self.tbl.io, discard_deleted=False):
165165
column_sizes = entry.data_file.column_sizes or {}
166166
value_counts = entry.data_file.value_counts or {}
167167
null_value_counts = entry.data_file.null_value_counts or {}

tests/integration/test_inspect_table.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,8 @@ def test_inspect_entries(
164164

165165
# Write some data
166166
tbl.append(arrow_table_with_null)
167+
# Generate a DELETE entry
168+
tbl.overwrite(arrow_table_with_null)
167169

168170
def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None:
169171
assert df.column_names == [
@@ -185,6 +187,8 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non
185187

186188
lhs = df.to_pandas()
187189
rhs = spark_df.toPandas()
190+
assert len(lhs) == len(rhs)
191+
188192
for column in df.column_names:
189193
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
190194
if column == "data_file":

0 commit comments

Comments (0)