From 068c786059a75dfe6dc2064b1c17e6a949e35150 Mon Sep 17 00:00:00 2001
From: Honah J
Date: Fri, 12 Apr 2024 17:53:22 -0700
Subject: [PATCH] Read: fetch file_schema directly from pyarrow_to_schema
 (#597)

---
 pyiceberg/io/pyarrow.py  | 10 ++--------
 tests/conftest.py        |  4 +++-
 tests/io/test_pyarrow.py |  6 +++---
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 038414ee40..357fa34078 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -120,7 +120,6 @@
     pre_order_visit,
     promote,
     prune_columns,
-    sanitize_column_names,
     visit,
     visit_with_partner,
 )
@@ -950,12 +949,7 @@ def _task_to_table(
     with fs.open_input_file(path) as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
-        schema_raw = None
-        if metadata := physical_schema.metadata:
-            schema_raw = metadata.get(ICEBERG_SCHEMA)
-        file_schema = (
-            Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema, name_mapping)
-        )
+        file_schema = pyarrow_to_schema(physical_schema, name_mapping)
 
         pyarrow_filter = None
         if bound_row_filter is not AlwaysTrue():
@@ -963,7 +957,7 @@
             bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
             pyarrow_filter = expression_to_pyarrow(bound_file_filter)
 
-        file_project_schema = sanitize_column_names(prune_columns(file_schema, projected_field_ids, select_full_types=False))
+        file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
 
         if file_schema is None:
             raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
diff --git a/tests/conftest.py b/tests/conftest.py
index 62444b457a..46fa4a8e3a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1913,9 +1913,11 @@ def data_file(table_schema_simple: Schema, tmp_path: str) -> str:
     import pyarrow as pa
     from pyarrow import parquet as pq
 
+    from pyiceberg.io.pyarrow import schema_to_pyarrow
+
     table = pa.table(
         {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": [True, False, None]},
-        metadata={"iceberg.schema": table_schema_simple.model_dump_json()},
+        schema=schema_to_pyarrow(table_schema_simple),
     )
 
     file_path = f"{tmp_path}/0000-data.parquet"
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 2acffdfdf9..98e6bf5e5e 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -1383,7 +1383,7 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
         str(with_deletes)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","c"]]
@@ -1426,7 +1426,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
         str(with_deletes)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","c"]]
@@ -1462,7 +1462,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
         str(projection)
         == """pyarrow.Table
 foo: string
-bar: int64 not null
+bar: int32 not null
 baz: bool
 ----
 foo: [["a","b","c"]]