@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary
@@ -473,7 +473,7 @@ def history(self) -> "pa.Table":
 
         return pa.Table.from_pylist(history, schema=history_schema)
 
-    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+    def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
         import pyarrow as pa
 
         from pyiceberg.io.pyarrow import schema_to_pyarrow
@@ -530,6 +530,8 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
         for manifest_list in snapshot.manifests(io):
             for manifest_entry in manifest_list.fetch_manifest_entry(io):
                 data_file = manifest_entry.data_file
+                if data_file_filter and data_file.content not in data_file_filter:
+                    continue
                 column_sizes = data_file.column_sizes or {}
                 value_counts = data_file.value_counts or {}
                 null_value_counts = data_file.null_value_counts or {}
@@ -558,12 +560,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                     "spec_id": data_file.spec_id,
                     "record_count": data_file.record_count,
                     "file_size_in_bytes": data_file.file_size_in_bytes,
-                    "column_sizes": dict(data_file.column_sizes),
-                    "value_counts": dict(data_file.value_counts),
-                    "null_value_counts": dict(data_file.null_value_counts),
-                    "nan_value_counts": dict(data_file.nan_value_counts),
-                    "lower_bounds": dict(data_file.lower_bounds),
-                    "upper_bounds": dict(data_file.upper_bounds),
+                    "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None,
+                    "value_counts": dict(data_file.value_counts) if data_file.value_counts is not None else None,
+                    "null_value_counts": dict(data_file.null_value_counts) if data_file.null_value_counts is not None else None,
+                    "nan_value_counts": dict(data_file.nan_value_counts) if data_file.nan_value_counts is not None else None,
+                    "lower_bounds": dict(data_file.lower_bounds) if data_file.lower_bounds is not None else None,
+                    "upper_bounds": dict(data_file.upper_bounds) if data_file.upper_bounds is not None else None,
                     "key_metadata": data_file.key_metadata,
                     "split_offsets": data_file.split_offsets,
                     "equality_ids": data_file.equality_ids,
@@ -575,3 +577,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
             files,
             schema=files_schema,
         )
+
+    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id)
+
+    def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id, {DataFileContent.DATA})
+
+    def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})
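A minimal usage sketch, assuming these methods are exposed through the table's `inspect` accessor the same way the existing `files` method is; the catalog name and table identifier below are placeholders:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")             # hypothetical catalog name
tbl = catalog.load_table("default.my_table")  # hypothetical table identifier

all_files = tbl.inspect.files()        # data + delete files, as a pyarrow Table
data_only = tbl.inspect.data_files()   # only DataFileContent.DATA entries
deletes = tbl.inspect.delete_files()   # positional + equality delete files

print(deletes.num_rows, "delete files in the current snapshot")
```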