@@ -590,16 +590,14 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi
590
590
raise ValueError (f"Unsupported file format: { file_format } " )
591
591
592
592
593
- def _construct_fragment (fs : FileSystem , data_file : DataFile ,
594
- file_format_kwargs : Dict [str , Any ] = EMPTY_DICT ) -> ds .Fragment :
593
+ def _construct_fragment (fs : FileSystem , data_file : DataFile , file_format_kwargs : Dict [str , Any ] = EMPTY_DICT ) -> ds .Fragment :
595
594
_ , _ , path = PyArrowFileIO .parse_location (data_file .file_path )
596
595
return _get_file_format (data_file .file_format , ** file_format_kwargs ).make_fragment (path , fs )
597
596
598
597
599
598
def _read_deletes (fs : FileSystem , data_file : DataFile ) -> Dict [str , pa .ChunkedArray ]:
600
599
delete_fragment = _construct_fragment (
601
- fs , data_file ,
602
- file_format_kwargs = {"dictionary_columns" : ("file_path" ,), "pre_buffer" : True , "buffer_size" : ONE_MEGABYTE }
600
+ fs , data_file , file_format_kwargs = {"dictionary_columns" : ("file_path" ,), "pre_buffer" : True , "buffer_size" : ONE_MEGABYTE }
603
601
)
604
602
table = ds .Scanner .from_fragment (fragment = delete_fragment ).to_table ()
605
603
table = table .unify_dictionaries ()
@@ -731,8 +729,7 @@ def _get_field_doc(field: pa.Field) -> Optional[str]:
731
729
732
730
733
731
class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
734
- def _convert_fields (self , arrow_fields : Iterable [pa .Field ], field_results : List [Optional [IcebergType ]]) -> List [
735
- NestedField ]:
732
+ def _convert_fields (self , arrow_fields : Iterable [pa .Field ], field_results : List [Optional [IcebergType ]]) -> List [NestedField ]:
736
733
fields = []
737
734
for i , field in enumerate (arrow_fields ):
738
735
field_id = _get_field_id (field )
@@ -756,7 +753,7 @@ def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) ->
756
753
return None
757
754
758
755
def map (
759
- self , map_type : pa .MapType , key_result : Optional [IcebergType ], value_result : Optional [IcebergType ]
756
+ self , map_type : pa .MapType , key_result : Optional [IcebergType ], value_result : Optional [IcebergType ]
760
757
) -> Optional [IcebergType ]:
761
758
key_field = map_type .key_field
762
759
key_id = _get_field_id (key_field )
@@ -825,15 +822,15 @@ def _hack_names(column_name_list: list[str], enabled: bool):
825
822
return column_name_list
826
823
827
824
def _task_to_table (
828
- fs : FileSystem ,
829
- task : FileScanTask ,
830
- bound_row_filter : BooleanExpression ,
831
- projected_schema : Schema ,
832
- projected_field_ids : Set [int ],
833
- positional_deletes : Optional [List [ChunkedArray ]],
834
- case_sensitive : bool ,
835
- row_counts : List [int ],
836
- limit : Optional [int ] = None ,
825
+ fs : FileSystem ,
826
+ task : FileScanTask ,
827
+ bound_row_filter : BooleanExpression ,
828
+ projected_schema : Schema ,
829
+ projected_field_ids : Set [int ],
830
+ positional_deletes : Optional [List [ChunkedArray ]],
831
+ case_sensitive : bool ,
832
+ row_counts : List [int ],
833
+ limit : Optional [int ] = None ,
837
834
) -> Optional [pa .Table ]:
838
835
if limit and sum (row_counts ) >= limit :
839
836
return None
@@ -848,17 +845,15 @@ def _task_to_table(
848
845
schema_raw = metadata .get (ICEBERG_SCHEMA )
849
846
# TODO: if field_ids are not present, Name Mapping should be implemented to look them up in the table schema,
850
847
# see https://github.com/apache/iceberg/issues/7451
851
- file_schema = Schema .model_validate_json (schema_raw ) if schema_raw is not None else pyarrow_to_schema (
852
- physical_schema )
848
+ file_schema = Schema .model_validate_json (schema_raw ) if schema_raw is not None else pyarrow_to_schema (physical_schema )
853
849
854
850
pyarrow_filter = None
855
851
if bound_row_filter is not AlwaysTrue ():
856
852
translated_row_filter = translate_column_names (bound_row_filter , file_schema , case_sensitive = case_sensitive )
857
853
bound_file_filter = bind (file_schema , translated_row_filter , case_sensitive = case_sensitive )
858
854
pyarrow_filter = expression_to_pyarrow (bound_file_filter )
859
855
860
- file_project_schema = sanitize_column_names (
861
- prune_columns (file_schema , projected_field_ids , select_full_types = False ))
856
+ file_project_schema = sanitize_column_names (prune_columns (file_schema , projected_field_ids , select_full_types = False ))
862
857
863
858
if file_schema is None :
864
859
raise ValueError (f"Missing Iceberg schema in Metadata for file: { path } " )
@@ -929,12 +924,12 @@ def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dic
929
924
930
925
931
926
def project_table (
932
- tasks : Iterable [FileScanTask ],
933
- table : Table ,
934
- row_filter : BooleanExpression ,
935
- projected_schema : Schema ,
936
- case_sensitive : bool = True ,
937
- limit : Optional [int ] = None ,
927
+ tasks : Iterable [FileScanTask ],
928
+ table : Table ,
929
+ row_filter : BooleanExpression ,
930
+ projected_schema : Schema ,
931
+ case_sensitive : bool = True ,
932
+ limit : Optional [int ] = None ,
938
933
) -> pa .Table :
939
934
"""Resolve the right columns based on the identifier.
940
935
@@ -1019,8 +1014,7 @@ def project_table(
1019
1014
1020
1015
1021
1016
def to_requested_schema (requested_schema : Schema , file_schema : Schema , table : pa .Table ) -> pa .Table :
1022
- struct_array = visit_with_partner (requested_schema , table , ArrowProjectionVisitor (file_schema ),
1023
- ArrowAccessor (file_schema ))
1017
+ struct_array = visit_with_partner (requested_schema , table , ArrowProjectionVisitor (file_schema ), ArrowAccessor (file_schema ))
1024
1018
1025
1019
arrays = []
1026
1020
fields = []
@@ -1043,12 +1037,11 @@ def cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
1043
1037
return values .cast (schema_to_pyarrow (promote (file_field .field_type , field .field_type )))
1044
1038
return values
1045
1039
1046
- def schema (self , schema : Schema , schema_partner : Optional [pa .Array ], struct_result : Optional [pa .Array ]) -> Optional [
1047
- pa .Array ]:
1040
+ def schema (self , schema : Schema , schema_partner : Optional [pa .Array ], struct_result : Optional [pa .Array ]) -> Optional [pa .Array ]:
1048
1041
return struct_result
1049
1042
1050
1043
def struct (
1051
- self , struct : StructType , struct_array : Optional [pa .Array ], field_results : List [Optional [pa .Array ]]
1044
+ self , struct : StructType , struct_array : Optional [pa .Array ], field_results : List [Optional [pa .Array ]]
1052
1045
) -> Optional [pa .Array ]:
1053
1046
if struct_array is None :
1054
1047
return None
@@ -1071,17 +1064,15 @@ def struct(
1071
1064
def field (self , field : NestedField , _ : Optional [pa .Array ], field_array : Optional [pa .Array ]) -> Optional [pa .Array ]:
1072
1065
return field_array
1073
1066
1074
- def list (self , list_type : ListType , list_array : Optional [pa .Array ], value_array : Optional [pa .Array ]) -> Optional [
1075
- pa .Array ]:
1067
+ def list (self , list_type : ListType , list_array : Optional [pa .Array ], value_array : Optional [pa .Array ]) -> Optional [pa .Array ]:
1076
1068
return (
1077
1069
pa .ListArray .from_arrays (list_array .offsets , self .cast_if_needed (list_type .element_field , value_array ))
1078
1070
if isinstance (list_array , pa .ListArray )
1079
1071
else None
1080
1072
)
1081
1073
1082
1074
def map (
1083
- self , map_type : MapType , map_array : Optional [pa .Array ], key_result : Optional [pa .Array ],
1084
- value_result : Optional [pa .Array ]
1075
+ self , map_type : MapType , map_array : Optional [pa .Array ], key_result : Optional [pa .Array ], value_result : Optional [pa .Array ]
1085
1076
) -> Optional [pa .Array ]:
1086
1077
return (
1087
1078
pa .MapArray .from_arrays (
@@ -1202,8 +1193,7 @@ class StatsAggregator:
1202
1193
current_max : Any
1203
1194
trunc_length : Optional [int ]
1204
1195
1205
- def __init__ (self , iceberg_type : PrimitiveType , physical_type_string : str ,
1206
- trunc_length : Optional [int ] = None ) -> None :
1196
+ def __init__ (self , iceberg_type : PrimitiveType , physical_type_string : str , trunc_length : Optional [int ] = None ) -> None :
1207
1197
self .current_min = None
1208
1198
self .current_max = None
1209
1199
self .trunc_length = trunc_length
@@ -1316,30 +1306,27 @@ def __init__(self, schema: Schema, properties: Dict[str, str]):
1316
1306
self ._properties = properties
1317
1307
self ._default_mode = self ._properties .get (DEFAULT_METRICS_MODE_KEY )
1318
1308
1319
- def schema (self , schema : Schema , struct_result : Callable [[], List [StatisticsCollector ]]) -> List [
1320
- StatisticsCollector ]:
1309
+ def schema (self , schema : Schema , struct_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
1321
1310
return struct_result ()
1322
1311
1323
1312
def struct (
1324
- self , struct : StructType , field_results : List [Callable [[], List [StatisticsCollector ]]]
1313
+ self , struct : StructType , field_results : List [Callable [[], List [StatisticsCollector ]]]
1325
1314
) -> List [StatisticsCollector ]:
1326
1315
return list (chain (* [result () for result in field_results ]))
1327
1316
1328
- def field (self , field : NestedField , field_result : Callable [[], List [StatisticsCollector ]]) -> List [
1329
- StatisticsCollector ]:
1317
+ def field (self , field : NestedField , field_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
1330
1318
self ._field_id = field .field_id
1331
1319
return field_result ()
1332
1320
1333
- def list (self , list_type : ListType , element_result : Callable [[], List [StatisticsCollector ]]) -> List [
1334
- StatisticsCollector ]:
1321
+ def list (self , list_type : ListType , element_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
1335
1322
self ._field_id = list_type .element_id
1336
1323
return element_result ()
1337
1324
1338
1325
def map (
1339
- self ,
1340
- map_type : MapType ,
1341
- key_result : Callable [[], List [StatisticsCollector ]],
1342
- value_result : Callable [[], List [StatisticsCollector ]],
1326
+ self ,
1327
+ map_type : MapType ,
1328
+ key_result : Callable [[], List [StatisticsCollector ]],
1329
+ value_result : Callable [[], List [StatisticsCollector ]],
1343
1330
) -> List [StatisticsCollector ]:
1344
1331
self ._field_id = map_type .key_id
1345
1332
k = key_result ()
@@ -1362,8 +1349,8 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
1362
1349
metrics_mode = match_metrics_mode (col_mode )
1363
1350
1364
1351
if (
1365
- not (isinstance (primitive , StringType ) or isinstance (primitive , BinaryType ))
1366
- and metrics_mode .type == MetricModeTypes .TRUNCATE
1352
+ not (isinstance (primitive , StringType ) or isinstance (primitive , BinaryType ))
1353
+ and metrics_mode .type == MetricModeTypes .TRUNCATE
1367
1354
):
1368
1355
metrics_mode = MetricsMode (MetricModeTypes .FULL )
1369
1356
@@ -1372,13 +1359,12 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
1372
1359
if is_nested and metrics_mode .type in [MetricModeTypes .TRUNCATE , MetricModeTypes .FULL ]:
1373
1360
metrics_mode = MetricsMode (MetricModeTypes .COUNTS )
1374
1361
1375
- return [StatisticsCollector (field_id = self ._field_id , iceberg_type = primitive , mode = metrics_mode ,
1376
- column_name = column_name )]
1362
+ return [StatisticsCollector (field_id = self ._field_id , iceberg_type = primitive , mode = metrics_mode , column_name = column_name )]
1377
1363
1378
1364
1379
1365
def compute_statistics_plan (
1380
- schema : Schema ,
1381
- table_properties : Dict [str , str ],
1366
+ schema : Schema ,
1367
+ table_properties : Dict [str , str ],
1382
1368
) -> Dict [int , StatisticsCollector ]:
1383
1369
"""
1384
1370
Compute the statistics plan for all columns.
@@ -1417,8 +1403,7 @@ def __init__(self) -> None:
1417
1403
def schema (self , schema : Schema , struct_result : Callable [[], List [ID2ParquetPath ]]) -> List [ID2ParquetPath ]:
1418
1404
return struct_result ()
1419
1405
1420
- def struct (self , struct : StructType , field_results : List [Callable [[], List [ID2ParquetPath ]]]) -> List [
1421
- ID2ParquetPath ]:
1406
+ def struct (self , struct : StructType , field_results : List [Callable [[], List [ID2ParquetPath ]]]) -> List [ID2ParquetPath ]:
1422
1407
return list (chain (* [result () for result in field_results ]))
1423
1408
1424
1409
def field (self , field : NestedField , field_result : Callable [[], List [ID2ParquetPath ]]) -> List [ID2ParquetPath ]:
@@ -1436,10 +1421,10 @@ def list(self, list_type: ListType, element_result: Callable[[], List[ID2Parquet
1436
1421
return result
1437
1422
1438
1423
def map (
1439
- self ,
1440
- map_type : MapType ,
1441
- key_result : Callable [[], List [ID2ParquetPath ]],
1442
- value_result : Callable [[], List [ID2ParquetPath ]],
1424
+ self ,
1425
+ map_type : MapType ,
1426
+ key_result : Callable [[], List [ID2ParquetPath ]],
1427
+ value_result : Callable [[], List [ID2ParquetPath ]],
1443
1428
) -> List [ID2ParquetPath ]:
1444
1429
self ._field_id = map_type .key_id
1445
1430
self ._path .append ("key_value.key" )
@@ -1456,7 +1441,7 @@ def primitive(self, primitive: PrimitiveType) -> List[ID2ParquetPath]:
1456
1441
1457
1442
1458
1443
def parquet_path_to_id_mapping (
1459
- schema : Schema ,
1444
+ schema : Schema ,
1460
1445
) -> Dict [str , int ]:
1461
1446
"""
1462
1447
Compute the mapping of parquet column path to Iceberg ID.
@@ -1475,11 +1460,11 @@ def parquet_path_to_id_mapping(
1475
1460
1476
1461
1477
1462
def fill_parquet_file_metadata (
1478
- df : DataFile ,
1479
- parquet_metadata : pq .FileMetaData ,
1480
- file_size : int ,
1481
- stats_columns : Dict [int , StatisticsCollector ],
1482
- parquet_column_mapping : Dict [str , int ],
1463
+ df : DataFile ,
1464
+ parquet_metadata : pq .FileMetaData ,
1465
+ file_size : int ,
1466
+ stats_columns : Dict [int , StatisticsCollector ],
1467
+ parquet_column_mapping : Dict [str , int ],
1483
1468
) -> None :
1484
1469
"""
1485
1470
Compute and fill the following fields of the DataFile object.
0 commit comments