@@ -1197,7 +1197,7 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
+    use_large_types: Optional[bool] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = PyArrowFileIO.parse_location(task.file.file_path)
     arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
@@ -1220,15 +1220,16 @@ def _task_to_record_batches(
     if file_schema is None:
         raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")

+    projected_file_schema = None
+    if use_large_types is not None:
+        if use_large_types is True:
+            projected_file_schema = _pyarrow_schema_ensure_large_types(physical_schema)
+        else:
+            projected_file_schema = _pyarrow_schema_ensure_small_types(physical_schema)
+
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # With PyArrow 16.0.0 there is an issue with casting record-batches:
-        # https://github.com/apache/arrow/issues/41884
-        # https://github.com/apache/arrow/issues/43183
-        # Would be good to remove this later on
-        schema=_pyarrow_schema_ensure_large_types(physical_schema)
-        if use_large_types
-        else (_pyarrow_schema_ensure_small_types(physical_schema)),
+        schema=projected_file_schema,
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
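Note: a minimal sketch of the tri-state schema selection this hunk introduces, assuming only PyArrow is available. When use_large_types is left as None, no explicit schema is handed to ds.Scanner.from_fragment, so the fragment keeps its physical Parquet schema; an explicit True/False coerces string/binary columns to their large or small Arrow variants. The helpers below are simplified, top-level-only stand-ins for the module's recursive _pyarrow_schema_ensure_large_types / _pyarrow_schema_ensure_small_types.

from typing import Optional
import pyarrow as pa

# Simplified stand-ins: only rewrite top-level string/binary fields, unlike the
# module's recursive helpers, which also handle nested types.
def ensure_large_types(schema: pa.Schema) -> pa.Schema:
    mapping = {pa.string(): pa.large_string(), pa.binary(): pa.large_binary()}
    return pa.schema([f.with_type(mapping.get(f.type, f.type)) for f in schema])

def ensure_small_types(schema: pa.Schema) -> pa.Schema:
    mapping = {pa.large_string(): pa.string(), pa.large_binary(): pa.binary()}
    return pa.schema([f.with_type(mapping.get(f.type, f.type)) for f in schema])

def select_scan_schema(physical_schema: pa.Schema, use_large_types: Optional[bool]) -> Optional[pa.Schema]:
    if use_large_types is None:
        return None  # scanner falls back to the fragment's physical schema
    return ensure_large_types(physical_schema) if use_large_types else ensure_small_types(physical_schema)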
@@ -1246,14 +1247,9 @@ def _task_to_record_batches(
             batch = batch.take(indices)
         # Apply the user filter
         if pyarrow_filter is not None:
-            # we need to switch back and forth between RecordBatch and Table
-            # as Expression filter isn't yet supported in RecordBatch
-            # https://github.com/apache/arrow/issues/39220
-            arrow_table = pa.Table.from_batches([batch])
-            arrow_table = arrow_table.filter(pyarrow_filter)
-            if len(arrow_table) == 0:
+            batch = batch.filter(pyarrow_filter)
+            if len(batch) == 0:
                 continue
-            batch = arrow_table.to_batches()[0]
         yield _to_requested_schema(
             projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True, use_large_types=use_large_types
         )
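The RecordBatch-to-Table round-trip removed above existed only because Expression filters were not supported on RecordBatch (apache/arrow#39220); the replacement relies on RecordBatch.filter accepting a pyarrow.compute Expression directly in the PyArrow versions this change targets. A small self-contained illustration with made-up column names:

import pyarrow as pa
import pyarrow.compute as pc

batch = pa.RecordBatch.from_pydict({"id": [1, 2, 3], "name": ["a", "b", "c"]})
pyarrow_filter = pc.field("id") > 1  # the same kind of Expression the scan builds

filtered = batch.filter(pyarrow_filter)  # stays a RecordBatch, no pa.Table round-trip
print(filtered.num_rows)  # 2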
@@ -1268,7 +1264,7 @@ def _task_to_table(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
+    use_large_types: Optional[bool] = None,
 ) -> Optional[pa.Table]:
     batches = list(
         _task_to_record_batches(
@@ -1348,7 +1344,9 @@ def project_table(
         # When FsSpec is not installed
         raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e

-    use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
+    use_large_types = None
+    if PYARROW_USE_LARGE_TYPES_ON_READ in io.properties:
+        use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)

     bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive)
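For context, a hedged sketch of how the property feeds this code path: only when PYARROW_USE_LARGE_TYPES_ON_READ (assumed here to be "pyarrow.use-large-types-on-read") is explicitly present in io.properties does project_table pass a concrete boolean down; otherwise use_large_types stays None and readers keep the file's physical types. The helper below is a stand-in for the property_as_bool call, not the library API.

from typing import Dict, Optional

# Assumed key; in pyiceberg this is the PYARROW_USE_LARGE_TYPES_ON_READ constant.
PYARROW_USE_LARGE_TYPES_ON_READ = "pyarrow.use-large-types-on-read"

def resolve_use_large_types(properties: Dict[str, str]) -> Optional[bool]:
    # Mirrors the hunk above: unset property -> None, set property -> parsed boolean.
    if PYARROW_USE_LARGE_TYPES_ON_READ not in properties:
        return None
    return properties[PYARROW_USE_LARGE_TYPES_ON_READ].strip().lower() == "true"

print(resolve_use_large_types({}))                                             # None
print(resolve_use_large_types({"pyarrow.use-large-types-on-read": "false"}))   # False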