@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary
@@ -473,7 +473,7 @@ def history(self) -> "pa.Table":
 
         return pa.Table.from_pylist(history, schema=history_schema)
 
-    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+    def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
         import pyarrow as pa
 
         from pyiceberg.io.pyarrow import schema_to_pyarrow
@@ -530,6 +530,8 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
         for manifest_list in snapshot.manifests(io):
             for manifest_entry in manifest_list.fetch_manifest_entry(io):
                 data_file = manifest_entry.data_file
+                if data_file_filter and data_file.content not in data_file_filter:
+                    continue
                 column_sizes = data_file.column_sizes or {}
                 value_counts = data_file.value_counts or {}
                 null_value_counts = data_file.null_value_counts or {}
@@ -558,12 +560,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                     "spec_id": data_file.spec_id,
                     "record_count": data_file.record_count,
                     "file_size_in_bytes": data_file.file_size_in_bytes,
-                    "column_sizes": dict(data_file.column_sizes),
-                    "value_counts": dict(data_file.value_counts),
-                    "null_value_counts": dict(data_file.null_value_counts),
-                    "nan_value_counts": dict(data_file.nan_value_counts),
-                    "lower_bounds": dict(data_file.lower_bounds),
-                    "upper_bounds": dict(data_file.upper_bounds),
+                    "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None,
+                    "value_counts": dict(data_file.value_counts) if data_file.value_counts is not None else None,
+                    "null_value_counts": dict(data_file.null_value_counts) if data_file.null_value_counts is not None else None,
+                    "nan_value_counts": dict(data_file.nan_value_counts) if data_file.nan_value_counts is not None else None,
+                    "lower_bounds": dict(data_file.lower_bounds) if data_file.lower_bounds is not None else None,
+                    "upper_bounds": dict(data_file.upper_bounds) if data_file.upper_bounds is not None else None,
                     "key_metadata": data_file.key_metadata,
                     "split_offsets": data_file.split_offsets,
                     "equality_ids": data_file.equality_ids,
@@ -575,3 +577,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
             files,
             schema=files_schema,
         )
+
+    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id)
+
+    def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id, {DataFileContent.DATA})
+
+    def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+        return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES})
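A minimal usage sketch, assuming these methods are exposed through the table's `inspect` accessor the same way the existing `files` method is; the catalog name and table identifier below are placeholders:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")             # hypothetical catalog name
tbl = catalog.load_table("default.my_table")  # hypothetical table identifier

all_files = tbl.inspect.files()        # data + delete files, as a pyarrow Table
data_only = tbl.inspect.data_files()   # only DataFileContent.DATA entries
deletes = tbl.inspect.delete_files()   # positional + equality delete files

print(deletes.num_rows, "delete files in the current snapshot")
```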