From 068c786059a75dfe6dc2064b1c17e6a949e35150 Mon Sep 17 00:00:00 2001
From: Honah J
Date: Fri, 12 Apr 2024 17:53:22 -0700
Subject: [PATCH] Read: fetch file_schema directly from pyarrow_to_schema
 (#597)

---
 pyiceberg/io/pyarrow.py  | 10 ++--------
 tests/conftest.py        |  4 +++-
 tests/io/test_pyarrow.py |  6 +++---
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 038414ee40..357fa34078 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -120,7 +120,6 @@
     pre_order_visit,
     promote,
     prune_columns,
-    sanitize_column_names,
     visit,
     visit_with_partner,
 )
@@ -950,12 +949,7 @@ def _task_to_table(
     with fs.open_input_file(path) as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
-        schema_raw = None
-        if metadata := physical_schema.metadata:
-            schema_raw = metadata.get(ICEBERG_SCHEMA)
-        file_schema = (
-            Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema, name_mapping)
-        )
+        file_schema = pyarrow_to_schema(physical_schema, name_mapping)
 
         pyarrow_filter = None
         if bound_row_filter is not AlwaysTrue():
@@ -963,7 +957,7 @@
             bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
             pyarrow_filter = expression_to_pyarrow(bound_file_filter)
 
-        file_project_schema = sanitize_column_names(prune_columns(file_schema, projected_field_ids, select_full_types=False))
+        file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
 
         if file_schema is None:
             raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
diff --git a/tests/conftest.py b/tests/conftest.py
index 62444b457a..46fa4a8e3a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1913,9 +1913,11 @@ def data_file(table_schema_simple: Schema, tmp_path: str) -> str:
     import pyarrow as pa
     from pyarrow import parquet as pq
 
+    from pyiceberg.io.pyarrow import schema_to_pyarrow
+
     table = pa.table(
         {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": [True, False, None]},
-        metadata={"iceberg.schema": table_schema_simple.model_dump_json()},
+        schema=schema_to_pyarrow(table_schema_simple),
     )
 
     file_path = f"{tmp_path}/0000-data.parquet"
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 2acffdfdf9..98e6bf5e5e 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -1383,7 +1383,7 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
         str(with_deletes)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","c"]]
@@ -1426,7 +1426,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
         str(with_deletes)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","c"]]
@@ -1462,7 +1462,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
         str(projection)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","b","c"]]