Skip to content

Commit fb10185

Browse files
committed
refactored _get_files_from_manifest and _all_files methods
1 parent 96c680b commit fb10185

File tree

1 file changed

+20
-23
lines changed

1 file changed

+20
-23
lines changed

pyiceberg/table/inspect.py

+20-23
Original file line numberDiff line numberDiff line change
@@ -523,9 +523,11 @@ def history(self) -> "pa.Table":
523523

524524
return pa.Table.from_pylist(history, schema=history_schema)
525525

526-
def _files_by_manifest(
526+
def _get_files_from_manifest(
527527
self, manifest_list: ManifestFile, data_file_filter: Optional[Set[DataFileContent]] = None
528-
) -> List[Dict[str, Any]]:
528+
) -> "pa.Table":
529+
import pyarrow as pa
530+
529531
files: list[dict[str, Any]] = []
530532
schema = self.tbl.metadata.schema()
531533
io = self.tbl.io
@@ -576,7 +578,10 @@ def _files_by_manifest(
576578
"readable_metrics": readable_metrics,
577579
}
578580
)
579-
return files
581+
return pa.Table.from_pylist(
582+
files,
583+
schema=self._get_files_schema(),
584+
)
580585

581586
def _get_files_schema(self) -> "pa.Schema":
582587
import pyarrow as pa
@@ -630,23 +635,20 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
630635
def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
631636
import pyarrow as pa
632637

633-
files: list[dict[str, Any]] = []
638+
files_table: list[pa.Table] = []
634639

635640
if not snapshot_id and not self.tbl.metadata.current_snapshot():
636641
return pa.Table.from_pylist(
637-
files,
642+
[],
638643
schema=self._get_files_schema(),
639644
)
640645
snapshot = self._get_snapshot(snapshot_id)
641646

642647
io = self.tbl.io
643648
for manifest_list in snapshot.manifests(io):
644-
files.extend(self._files_by_manifest(manifest_list, data_file_filter))
649+
files_table.append(self._get_files_from_manifest(manifest_list, data_file_filter))
645650

646-
return pa.Table.from_pylist(
647-
files,
648-
schema=self._get_files_schema(),
649-
)
651+
return pa.concat_tables(files_table)
650652

651653
def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
652654
return self._files(snapshot_id)
@@ -678,21 +680,16 @@ def _all_files(self, data_file_filter: Optional[Set[DataFileContent]] = None) ->
678680
return pa.Table.from_pylist([], schema=self._get_files_schema())
679681

680682
executor = ExecutorFactory.get_or_create()
681-
all_manifest_files_by_snapshot: Iterator[List[ManifestFile]] = executor.map(
682-
lambda args: args[0].manifests(self.tbl.io), [(snapshot,) for snapshot in snapshots]
683-
)
684-
all_manifest_files = list(
685-
{(manifest.manifest_path, manifest) for manifest_list in all_manifest_files_by_snapshot for manifest in manifest_list}
686-
)
687-
all_files_by_manifest: Iterator[List[Dict[str, Any]]] = executor.map(
688-
lambda args: self._files_by_manifest(*args), [(manifest, data_file_filter) for _, manifest in all_manifest_files]
689-
)
690-
all_files_list = [file for files in all_files_by_manifest for file in files]
691-
return pa.Table.from_pylist(
692-
all_files_list,
693-
schema=self._get_files_schema(),
683+
manifest_lists = executor.map(lambda snapshot: snapshot.manifests(self.tbl.io), snapshots)
684+
685+
unique_manifests = {(manifest.manifest_path, manifest) for manifest_list in manifest_lists for manifest in manifest_list}
686+
687+
file_lists = executor.map(
688+
lambda args: self._get_files_from_manifest(*args), [(manifest, data_file_filter) for _, manifest in unique_manifests]
694689
)
695690

691+
return pa.concat_tables(file_lists)
692+
696693
def all_files(self) -> "pa.Table":
697694
return self._all_files()
698695

0 commit comments

Comments
 (0)