@@ -523,9 +523,11 @@ def history(self) -> "pa.Table":
523
523
524
524
return pa .Table .from_pylist (history , schema = history_schema )
525
525
526
- def _files_by_manifest (
526
+ def _get_files_from_manifest (
527
527
self , manifest_list : ManifestFile , data_file_filter : Optional [Set [DataFileContent ]] = None
528
- ) -> List [Dict [str , Any ]]:
528
+ ) -> "pa.Table" :
529
+ import pyarrow as pa
530
+
529
531
files : list [dict [str , Any ]] = []
530
532
schema = self .tbl .metadata .schema ()
531
533
io = self .tbl .io
@@ -576,7 +578,10 @@ def _files_by_manifest(
576
578
"readable_metrics" : readable_metrics ,
577
579
}
578
580
)
579
- return files
581
+ return pa .Table .from_pylist (
582
+ files ,
583
+ schema = self ._get_files_schema (),
584
+ )
580
585
581
586
def _get_files_schema (self ) -> "pa.Schema" :
582
587
import pyarrow as pa
@@ -630,23 +635,20 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
630
635
def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
    """Build the files table for a single snapshot.

    Args:
        snapshot_id: Identifier handed to ``_get_snapshot``. When it is falsy
            and the table metadata has no current snapshot, an empty table is
            returned without looking anything up.
        data_file_filter: Forwarded unchanged to ``_get_files_from_manifest``
            for every manifest of the snapshot.

    Returns:
        A ``pa.Table`` made by concatenating the per-manifest tables, or an
        empty table using ``_get_files_schema()`` when there is nothing to
        concatenate.
    """
    import pyarrow as pa

    if not snapshot_id and not self.tbl.metadata.current_snapshot():
        return pa.Table.from_pylist([], schema=self._get_files_schema())

    snapshot = self._get_snapshot(snapshot_id)
    io = self.tbl.io

    manifest_tables = [
        self._get_files_from_manifest(manifest, data_file_filter)
        for manifest in snapshot.manifests(io)
    ]

    if not manifest_tables:
        # pa.concat_tables needs at least one table; a snapshot without
        # manifests must still produce an empty, correctly-typed result.
        return pa.Table.from_pylist([], schema=self._get_files_schema())

    return pa.concat_tables(manifest_tables)
650
652
651
653
def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
    """Return the result of ``_files`` for ``snapshot_id`` with no data-file filter."""
    table = self._files(snapshot_id)
    return table
@@ -678,21 +680,16 @@ def _all_files(self, data_file_filter: Optional[Set[DataFileContent]] = None) ->
678
680
return pa .Table .from_pylist ([], schema = self ._get_files_schema ())
679
681
680
682
executor = ExecutorFactory .get_or_create ()
681
- all_manifest_files_by_snapshot : Iterator [List [ManifestFile ]] = executor .map (
682
- lambda args : args [0 ].manifests (self .tbl .io ), [(snapshot ,) for snapshot in snapshots ]
683
- )
684
- all_manifest_files = list (
685
- {(manifest .manifest_path , manifest ) for manifest_list in all_manifest_files_by_snapshot for manifest in manifest_list }
686
- )
687
- all_files_by_manifest : Iterator [List [Dict [str , Any ]]] = executor .map (
688
- lambda args : self ._files_by_manifest (* args ), [(manifest , data_file_filter ) for _ , manifest in all_manifest_files ]
689
- )
690
- all_files_list = [file for files in all_files_by_manifest for file in files ]
691
- return pa .Table .from_pylist (
692
- all_files_list ,
693
- schema = self ._get_files_schema (),
683
+ manifest_lists = executor .map (lambda snapshot : snapshot .manifests (self .tbl .io ), snapshots )
684
+
685
+ unique_manifests = {(manifest .manifest_path , manifest ) for manifest_list in manifest_lists for manifest in manifest_list }
686
+
687
+ file_lists = executor .map (
688
+ lambda args : self ._get_files_from_manifest (* args ), [(manifest , data_file_filter ) for _ , manifest in unique_manifests ]
694
689
)
695
690
691
+ return pa .concat_tables (file_lists )
692
+
696
693
def all_files(self) -> "pa.Table":
    """Return the result of ``_all_files`` called with its default arguments."""
    table = self._all_files()
    return table
698
695
0 commit comments