
Commit 9fff025

Add integration tests for files metadata tables with format version 3
1 parent 95a63cb commit 9fff025

File tree

2 files changed: +51 -11 lines

pyiceberg/table/inspect.py (+3 -7)
tests/integration/test_inspect_table.py (+48 -4)


pyiceberg/table/inspect.py

+3 -7
@@ -635,16 +635,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
     def _files(self, snapshot_id: Optional[int] = None, data_file_filter: Optional[Set[DataFileContent]] = None) -> "pa.Table":
         import pyarrow as pa
 
-        files_table: list[pa.Table] = []
-
         if not snapshot_id and not self.tbl.metadata.current_snapshot():
-            return pa.Table.from_pylist(
-                [],
-                schema=self._get_files_schema(),
-            )
-        snapshot = self._get_snapshot(snapshot_id)
+            return self._get_files_schema().empty_table()
 
+        snapshot = self._get_snapshot(snapshot_id)
         io = self.tbl.io
+        files_table: list[pa.Table] = []
         for manifest_list in snapshot.manifests(io):
             files_table.append(self._get_files_from_manifest(manifest_list, data_file_filter))
 
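For context on the change above: PyArrow's Schema.empty_table() builds a zero-row table directly from a schema, which is exactly what the removed pa.Table.from_pylist([], schema=...) call produced. A minimal sketch, using a hypothetical two-column schema rather than pyiceberg's actual files schema:

import pyarrow as pa

# Hypothetical stand-in for self._get_files_schema(); the real files
# schema has many more columns.
schema = pa.schema([("file_path", pa.string()), ("record_count", pa.int64())])

# Schema.empty_table() yields a zero-row table with the schema intact,
# equivalent to pa.Table.from_pylist([], schema=schema) but more direct.
empty = schema.empty_table()
assert empty.num_rows == 0
assert empty.schema == schema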
tests/integration/test_inspect_table.py

+48 -4
@@ -100,10 +100,8 @@ def _inspect_files_asserts(df: pa.Table, spark_df: DataFrame) -> None:
         assert isinstance(value.as_py(), int)
 
     for split_offsets in df["split_offsets"]:
-        assert isinstance(split_offsets.as_py(), list)
-
-    for file_format in df["file_format"]:
-        assert file_format.as_py() == "PARQUET"
+        if split_offsets.as_py() is not None:
+            assert isinstance(split_offsets.as_py(), list)
 
     for file_path in df["file_path"]:
         assert file_path.as_py().startswith("s3://")
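The guard added above reflects that split_offsets is a nullable list column: entries without split offsets come back as None from .as_py(), presumably the case for format-version 3 delete files. A minimal sketch of the PyArrow behavior the guard relies on, with made-up values:

import pyarrow as pa

# A null entry in a list column round-trips to None via .as_py(),
# hence the None check before asserting on the list type.
col = pa.chunked_array([pa.array([[4, 100], None], type=pa.list_(pa.int64()))])
assert [v.as_py() for v in col] == [[4, 100], None]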
@@ -985,3 +983,49 @@ def test_inspect_all_files(
     _inspect_files_asserts(all_files_df, spark.table(f"{identifier}.all_files"))
     _inspect_files_asserts(all_data_files_df, spark.table(f"{identifier}.all_data_files"))
     _inspect_files_asserts(all_delete_files_df, spark.table(f"{identifier}.all_delete_files"))
+
+
+@pytest.mark.integration
+def test_inspect_files_format_version_3(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.table_metadata_files"
+
+    tbl = _create_table(
+        session_catalog,
+        identifier,
+        properties={
+            "format-version": "3",
+            "write.delete.mode": "merge-on-read",
+            "write.update.mode": "merge-on-read",
+            "write.merge.mode": "merge-on-read",
+        },
+    )
+
+    insert_data_sql = f"""INSERT INTO {identifier} VALUES
+        (false, 'a', 'aaaaaaaaaaaaaaaaaaaaaa', 1, 1, 0.0, 0.0, TIMESTAMP('2023-01-01 19:25:00'), TIMESTAMP('2023-01-01 19:25:00+00:00'), DATE('2023-01-01'), X'01', X'00000000000000000000000000000000'),
+        (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
+        (true, 'z', 'zzzzzzzzzzzzzzzzzzzzzz', 9, 9, 0.9, 0.9, TIMESTAMP('2023-03-01 19:25:00'), TIMESTAMP('2023-03-01 19:25:00+00:00'), DATE('2023-03-01'), X'12', X'11111111111111111111111111111111');
+    """
+
+    spark.sql(insert_data_sql)
+    spark.sql(insert_data_sql)
+    spark.sql(f"UPDATE {identifier} SET int = 2 WHERE int = 1")
+    spark.sql(f"DELETE FROM {identifier} WHERE int = 9")
+    spark.table(identifier).show(20, False)
+
+    tbl.refresh()
+
+    files_df = tbl.inspect.files()
+    data_files_df = tbl.inspect.data_files()
+    delete_files_df = tbl.inspect.delete_files()
+
+    all_files_df = tbl.inspect.all_files()
+    all_data_files_df = tbl.inspect.all_data_files()
+    all_delete_files_df = tbl.inspect.all_delete_files()
+
+    _inspect_files_asserts(files_df, spark.table(f"{identifier}.files"))
+    _inspect_files_asserts(data_files_df, spark.table(f"{identifier}.data_files"))
+    _inspect_files_asserts(delete_files_df, spark.table(f"{identifier}.delete_files"))
+
+    _inspect_files_asserts(all_files_df, spark.table(f"{identifier}.all_files"))
+    _inspect_files_asserts(all_data_files_df, spark.table(f"{identifier}.all_data_files"))
+    _inspect_files_asserts(all_delete_files_df, spark.table(f"{identifier}.all_delete_files"))
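The new test drives merge-on-read UPDATE and DELETE against a format-version 3 table so the files metadata tables contain delete files alongside data files. A hedged usage sketch of inspecting the result, assuming tbl is the table created in the test above; the content codes follow the Iceberg spec:

# Assuming `tbl` is the format-version 3 table created in the test above.
files_df = tbl.inspect.files()

# `content` distinguishes file kinds per the Iceberg spec:
# 0 = data, 1 = position deletes, 2 = equality deletes.
print(files_df.column("content").unique())

# Under format version 3, delete files are not necessarily Parquet
# (the v3 spec stores deletion vectors in Puffin files), presumably why
# _inspect_files_asserts no longer asserts file_format == "PARQUET".
print(files_df.column("file_format").unique())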
