
Commit 9969926

Move to PyArrow 17
1 parent d8b5c17 commit 9969926

1 file changed (+15 -17 lines)

pyiceberg/io/pyarrow.py

Lines changed: 15 additions & 17 deletions
@@ -1197,7 +1197,7 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
+    use_large_types: Optional[bool] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = PyArrowFileIO.parse_location(task.file.file_path)
     arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
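For context (not part of the commit): `use_large_types` selects between Arrow's regular 32-bit-offset types and their 64-bit-offset "large" variants. A minimal PyArrow illustration of the two families the flag chooses between, not pyiceberg code:

    import pyarrow as pa

    # 32-bit offsets vs. 64-bit offsets for variable-length columns
    small = pa.schema([("name", pa.string()), ("blob", pa.binary())])
    large = pa.schema([("name", pa.large_string()), ("blob", pa.large_binary())])

    print(small)
    print(large)

With the default now `Optional[bool] = None`, neither coercion is requested unless the caller asks for one, as the next hunk implements.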
@@ -1220,15 +1220,16 @@ def _task_to_record_batches(
     if file_schema is None:
         raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")

+    projected_file_schema = None
+    if use_large_types is not None:
+        if use_large_types is True:
+            projected_file_schema = _pyarrow_schema_ensure_large_types(physical_schema)
+        else:
+            projected_file_schema = _pyarrow_schema_ensure_small_types(physical_schema)
+
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # With PyArrow 16.0.0 there is an issue with casting record-batches:
-        # https://github.com/apache/arrow/issues/41884
-        # https://github.com/apache/arrow/issues/43183
-        # Would be good to remove this later on
-        schema=_pyarrow_schema_ensure_large_types(physical_schema)
-        if use_large_types
-        else (_pyarrow_schema_ensure_small_types(physical_schema)),
+        schema=projected_file_schema,
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
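A self-contained sketch of the tri-state selection introduced above. The two `_ensure_*` helpers are simplified stand-ins for pyiceberg's `_pyarrow_schema_ensure_large_types` / `_pyarrow_schema_ensure_small_types` named in the diff and only handle string/binary fields for illustration:

    from typing import Optional

    import pyarrow as pa

    def _ensure_large(schema: pa.Schema) -> pa.Schema:
        # Stand-in: widen string/binary to their 64-bit-offset variants.
        swap = {pa.string(): pa.large_string(), pa.binary(): pa.large_binary()}
        return pa.schema([f.with_type(swap.get(f.type, f.type)) for f in schema])

    def _ensure_small(schema: pa.Schema) -> pa.Schema:
        # Stand-in: narrow large_string/large_binary back to 32-bit offsets.
        swap = {pa.large_string(): pa.string(), pa.large_binary(): pa.binary()}
        return pa.schema([f.with_type(swap.get(f.type, f.type)) for f in schema])

    def choose_scan_schema(physical_schema: pa.Schema, use_large_types: Optional[bool]) -> Optional[pa.Schema]:
        # None -> hand schema=None to the scanner and keep the file's own types.
        if use_large_types is None:
            return None
        return _ensure_large(physical_schema) if use_large_types else _ensure_small(physical_schema)

Passing `schema=None` to `ds.Scanner.from_fragment` leaves the fragment's physical schema untouched, which is what the new `None` default achieves; the PyArrow 16 casting workaround comments are dropped along with the unconditional coercion.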
@@ -1246,14 +1247,9 @@ def _task_to_record_batches(
             batch = batch.take(indices)
         # Apply the user filter
         if pyarrow_filter is not None:
-            # we need to switch back and forth between RecordBatch and Table
-            # as Expression filter isn't yet supported in RecordBatch
-            # https://github.com/apache/arrow/issues/39220
-            arrow_table = pa.Table.from_batches([batch])
-            arrow_table = arrow_table.filter(pyarrow_filter)
-            if len(arrow_table) == 0:
+            batch = batch.filter(pyarrow_filter)
+            if len(batch) == 0:
                 continue
-            batch = arrow_table.to_batches()[0]
         yield _to_requested_schema(
             projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True, use_large_types=use_large_types
         )
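The deleted Table round-trip existed because `RecordBatch.filter` did not accept a compute Expression (apache/arrow#39220). A minimal sketch, assuming the newer PyArrow this commit targets supports Expression filters on record batches directly, as the replacement code relies on:

    import pyarrow as pa
    import pyarrow.compute as pc

    batch = pa.RecordBatch.from_pydict({"id": [1, 2, 3], "qty": [10, 0, 7]})
    expr = pc.field("qty") > 0       # a compute Expression, not a boolean mask

    filtered = batch.filter(expr)    # previously: pa.Table.from_batches([batch]).filter(expr)
    print(filtered.num_rows)         # 2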
@@ -1268,7 +1264,7 @@ def _task_to_table(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
+    use_large_types: Optional[bool] = None,
 ) -> Optional[pa.Table]:
     batches = list(
         _task_to_record_batches(
@@ -1348,7 +1344,9 @@ def project_table(
         # When FsSpec is not installed
         raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e

-    use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
+    use_large_types = None
+    if PYARROW_USE_LARGE_TYPES_ON_READ in io.properties:
+        use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)

     bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive)
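A rough sketch of the new lookup behavior in `project_table`: coercion is only requested when the property is set explicitly, otherwise `use_large_types` stays `None`. This uses a plain dict as a stand-in for pyiceberg's `property_as_bool`, and the property string shown is illustrative only:

    from typing import Optional

    PYARROW_USE_LARGE_TYPES_ON_READ = "pyarrow.use-large-types-on-read"  # illustrative key, not verified

    def resolve_use_large_types(properties: dict) -> Optional[bool]:
        if PYARROW_USE_LARGE_TYPES_ON_READ not in properties:
            return None                                   # unset: leave the file's types alone
        value = str(properties[PYARROW_USE_LARGE_TYPES_ON_READ]).strip().lower()
        return value in ("true", "1")                     # crude stand-in for property_as_bool

    print(resolve_use_large_types({}))                                           # None
    print(resolve_use_large_types({PYARROW_USE_LARGE_TYPES_ON_READ: "false"}))   # False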
