@@ -590,16 +590,14 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi
590590 raise ValueError (f"Unsupported file format: { file_format } " )
591591
592592
593- def _construct_fragment (fs : FileSystem , data_file : DataFile ,
594- file_format_kwargs : Dict [str , Any ] = EMPTY_DICT ) -> ds .Fragment :
593+ def _construct_fragment (fs : FileSystem , data_file : DataFile , file_format_kwargs : Dict [str , Any ] = EMPTY_DICT ) -> ds .Fragment :
595594 _ , _ , path = PyArrowFileIO .parse_location (data_file .file_path )
596595 return _get_file_format (data_file .file_format , ** file_format_kwargs ).make_fragment (path , fs )
597596
598597
599598def _read_deletes (fs : FileSystem , data_file : DataFile ) -> Dict [str , pa .ChunkedArray ]:
600599 delete_fragment = _construct_fragment (
601- fs , data_file ,
602- file_format_kwargs = {"dictionary_columns" : ("file_path" ,), "pre_buffer" : True , "buffer_size" : ONE_MEGABYTE }
600+ fs , data_file , file_format_kwargs = {"dictionary_columns" : ("file_path" ,), "pre_buffer" : True , "buffer_size" : ONE_MEGABYTE }
603601 )
604602 table = ds .Scanner .from_fragment (fragment = delete_fragment ).to_table ()
605603 table = table .unify_dictionaries ()
@@ -731,8 +729,7 @@ def _get_field_doc(field: pa.Field) -> Optional[str]:
731729
732730
733731class _ConvertToIceberg (PyArrowSchemaVisitor [Union [IcebergType , Schema ]]):
734- def _convert_fields (self , arrow_fields : Iterable [pa .Field ], field_results : List [Optional [IcebergType ]]) -> List [
735- NestedField ]:
732+ def _convert_fields (self , arrow_fields : Iterable [pa .Field ], field_results : List [Optional [IcebergType ]]) -> List [NestedField ]:
736733 fields = []
737734 for i , field in enumerate (arrow_fields ):
738735 field_id = _get_field_id (field )
@@ -756,7 +753,7 @@ def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) ->
756753 return None
757754
758755 def map (
759- self , map_type : pa .MapType , key_result : Optional [IcebergType ], value_result : Optional [IcebergType ]
756+ self , map_type : pa .MapType , key_result : Optional [IcebergType ], value_result : Optional [IcebergType ]
760757 ) -> Optional [IcebergType ]:
761758 key_field = map_type .key_field
762759 key_id = _get_field_id (key_field )
@@ -825,15 +822,15 @@ def _hack_names(column_name_list: list[str], enabled: bool):
825822 return column_name_list
826823
827824def _task_to_table (
828- fs : FileSystem ,
829- task : FileScanTask ,
830- bound_row_filter : BooleanExpression ,
831- projected_schema : Schema ,
832- projected_field_ids : Set [int ],
833- positional_deletes : Optional [List [ChunkedArray ]],
834- case_sensitive : bool ,
835- row_counts : List [int ],
836- limit : Optional [int ] = None ,
825+ fs : FileSystem ,
826+ task : FileScanTask ,
827+ bound_row_filter : BooleanExpression ,
828+ projected_schema : Schema ,
829+ projected_field_ids : Set [int ],
830+ positional_deletes : Optional [List [ChunkedArray ]],
831+ case_sensitive : bool ,
832+ row_counts : List [int ],
833+ limit : Optional [int ] = None ,
837834) -> Optional [pa .Table ]:
838835 if limit and sum (row_counts ) >= limit :
839836 return None
@@ -848,17 +845,15 @@ def _task_to_table(
848845 schema_raw = metadata .get (ICEBERG_SCHEMA )
849846 # TODO: if field_ids are not present, Name Mapping should be implemented to look them up in the table schema,
850847 # see https://github.com/apache/iceberg/issues/7451
851- file_schema = Schema .model_validate_json (schema_raw ) if schema_raw is not None else pyarrow_to_schema (
852- physical_schema )
848+ file_schema = Schema .model_validate_json (schema_raw ) if schema_raw is not None else pyarrow_to_schema (physical_schema )
853849
854850 pyarrow_filter = None
855851 if bound_row_filter is not AlwaysTrue ():
856852 translated_row_filter = translate_column_names (bound_row_filter , file_schema , case_sensitive = case_sensitive )
857853 bound_file_filter = bind (file_schema , translated_row_filter , case_sensitive = case_sensitive )
858854 pyarrow_filter = expression_to_pyarrow (bound_file_filter )
859855
860- file_project_schema = sanitize_column_names (
861- prune_columns (file_schema , projected_field_ids , select_full_types = False ))
856+ file_project_schema = sanitize_column_names (prune_columns (file_schema , projected_field_ids , select_full_types = False ))
862857
863858 if file_schema is None :
864859 raise ValueError (f"Missing Iceberg schema in Metadata for file: { path } " )
@@ -929,12 +924,12 @@ def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dic
929924
930925
931926def project_table (
932- tasks : Iterable [FileScanTask ],
933- table : Table ,
934- row_filter : BooleanExpression ,
935- projected_schema : Schema ,
936- case_sensitive : bool = True ,
937- limit : Optional [int ] = None ,
927+ tasks : Iterable [FileScanTask ],
928+ table : Table ,
929+ row_filter : BooleanExpression ,
930+ projected_schema : Schema ,
931+ case_sensitive : bool = True ,
932+ limit : Optional [int ] = None ,
938933) -> pa .Table :
939934 """Resolve the right columns based on the identifier.
940935
@@ -1019,8 +1014,7 @@ def project_table(
10191014
10201015
10211016def to_requested_schema (requested_schema : Schema , file_schema : Schema , table : pa .Table ) -> pa .Table :
1022- struct_array = visit_with_partner (requested_schema , table , ArrowProjectionVisitor (file_schema ),
1023- ArrowAccessor (file_schema ))
1017+ struct_array = visit_with_partner (requested_schema , table , ArrowProjectionVisitor (file_schema ), ArrowAccessor (file_schema ))
10241018
10251019 arrays = []
10261020 fields = []
@@ -1043,12 +1037,11 @@ def cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
10431037 return values .cast (schema_to_pyarrow (promote (file_field .field_type , field .field_type )))
10441038 return values
10451039
1046- def schema (self , schema : Schema , schema_partner : Optional [pa .Array ], struct_result : Optional [pa .Array ]) -> Optional [
1047- pa .Array ]:
1040+ def schema (self , schema : Schema , schema_partner : Optional [pa .Array ], struct_result : Optional [pa .Array ]) -> Optional [pa .Array ]:
10481041 return struct_result
10491042
10501043 def struct (
1051- self , struct : StructType , struct_array : Optional [pa .Array ], field_results : List [Optional [pa .Array ]]
1044+ self , struct : StructType , struct_array : Optional [pa .Array ], field_results : List [Optional [pa .Array ]]
10521045 ) -> Optional [pa .Array ]:
10531046 if struct_array is None :
10541047 return None
@@ -1071,17 +1064,15 @@ def struct(
10711064 def field (self , field : NestedField , _ : Optional [pa .Array ], field_array : Optional [pa .Array ]) -> Optional [pa .Array ]:
10721065 return field_array
10731066
1074- def list (self , list_type : ListType , list_array : Optional [pa .Array ], value_array : Optional [pa .Array ]) -> Optional [
1075- pa .Array ]:
1067+ def list (self , list_type : ListType , list_array : Optional [pa .Array ], value_array : Optional [pa .Array ]) -> Optional [pa .Array ]:
10761068 return (
10771069 pa .ListArray .from_arrays (list_array .offsets , self .cast_if_needed (list_type .element_field , value_array ))
10781070 if isinstance (list_array , pa .ListArray )
10791071 else None
10801072 )
10811073
10821074 def map (
1083- self , map_type : MapType , map_array : Optional [pa .Array ], key_result : Optional [pa .Array ],
1084- value_result : Optional [pa .Array ]
1075+ self , map_type : MapType , map_array : Optional [pa .Array ], key_result : Optional [pa .Array ], value_result : Optional [pa .Array ]
10851076 ) -> Optional [pa .Array ]:
10861077 return (
10871078 pa .MapArray .from_arrays (
@@ -1202,8 +1193,7 @@ class StatsAggregator:
12021193 current_max : Any
12031194 trunc_length : Optional [int ]
12041195
1205- def __init__ (self , iceberg_type : PrimitiveType , physical_type_string : str ,
1206- trunc_length : Optional [int ] = None ) -> None :
1196+ def __init__ (self , iceberg_type : PrimitiveType , physical_type_string : str , trunc_length : Optional [int ] = None ) -> None :
12071197 self .current_min = None
12081198 self .current_max = None
12091199 self .trunc_length = trunc_length
@@ -1316,30 +1306,27 @@ def __init__(self, schema: Schema, properties: Dict[str, str]):
13161306 self ._properties = properties
13171307 self ._default_mode = self ._properties .get (DEFAULT_METRICS_MODE_KEY )
13181308
1319- def schema (self , schema : Schema , struct_result : Callable [[], List [StatisticsCollector ]]) -> List [
1320- StatisticsCollector ]:
1309+ def schema (self , schema : Schema , struct_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
13211310 return struct_result ()
13221311
13231312 def struct (
1324- self , struct : StructType , field_results : List [Callable [[], List [StatisticsCollector ]]]
1313+ self , struct : StructType , field_results : List [Callable [[], List [StatisticsCollector ]]]
13251314 ) -> List [StatisticsCollector ]:
13261315 return list (chain (* [result () for result in field_results ]))
13271316
1328- def field (self , field : NestedField , field_result : Callable [[], List [StatisticsCollector ]]) -> List [
1329- StatisticsCollector ]:
1317+ def field (self , field : NestedField , field_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
13301318 self ._field_id = field .field_id
13311319 return field_result ()
13321320
1333- def list (self , list_type : ListType , element_result : Callable [[], List [StatisticsCollector ]]) -> List [
1334- StatisticsCollector ]:
1321+ def list (self , list_type : ListType , element_result : Callable [[], List [StatisticsCollector ]]) -> List [StatisticsCollector ]:
13351322 self ._field_id = list_type .element_id
13361323 return element_result ()
13371324
13381325 def map (
1339- self ,
1340- map_type : MapType ,
1341- key_result : Callable [[], List [StatisticsCollector ]],
1342- value_result : Callable [[], List [StatisticsCollector ]],
1326+ self ,
1327+ map_type : MapType ,
1328+ key_result : Callable [[], List [StatisticsCollector ]],
1329+ value_result : Callable [[], List [StatisticsCollector ]],
13431330 ) -> List [StatisticsCollector ]:
13441331 self ._field_id = map_type .key_id
13451332 k = key_result ()
@@ -1362,8 +1349,8 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
13621349 metrics_mode = match_metrics_mode (col_mode )
13631350
13641351 if (
1365- not (isinstance (primitive , StringType ) or isinstance (primitive , BinaryType ))
1366- and metrics_mode .type == MetricModeTypes .TRUNCATE
1352+ not (isinstance (primitive , StringType ) or isinstance (primitive , BinaryType ))
1353+ and metrics_mode .type == MetricModeTypes .TRUNCATE
13671354 ):
13681355 metrics_mode = MetricsMode (MetricModeTypes .FULL )
13691356
@@ -1372,13 +1359,12 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
13721359 if is_nested and metrics_mode .type in [MetricModeTypes .TRUNCATE , MetricModeTypes .FULL ]:
13731360 metrics_mode = MetricsMode (MetricModeTypes .COUNTS )
13741361
1375- return [StatisticsCollector (field_id = self ._field_id , iceberg_type = primitive , mode = metrics_mode ,
1376- column_name = column_name )]
1362+ return [StatisticsCollector (field_id = self ._field_id , iceberg_type = primitive , mode = metrics_mode , column_name = column_name )]
13771363
13781364
13791365def compute_statistics_plan (
1380- schema : Schema ,
1381- table_properties : Dict [str , str ],
1366+ schema : Schema ,
1367+ table_properties : Dict [str , str ],
13821368) -> Dict [int , StatisticsCollector ]:
13831369 """
13841370 Compute the statistics plan for all columns.
@@ -1417,8 +1403,7 @@ def __init__(self) -> None:
14171403 def schema (self , schema : Schema , struct_result : Callable [[], List [ID2ParquetPath ]]) -> List [ID2ParquetPath ]:
14181404 return struct_result ()
14191405
1420- def struct (self , struct : StructType , field_results : List [Callable [[], List [ID2ParquetPath ]]]) -> List [
1421- ID2ParquetPath ]:
1406+ def struct (self , struct : StructType , field_results : List [Callable [[], List [ID2ParquetPath ]]]) -> List [ID2ParquetPath ]:
14221407 return list (chain (* [result () for result in field_results ]))
14231408
14241409 def field (self , field : NestedField , field_result : Callable [[], List [ID2ParquetPath ]]) -> List [ID2ParquetPath ]:
@@ -1436,10 +1421,10 @@ def list(self, list_type: ListType, element_result: Callable[[], List[ID2Parquet
14361421 return result
14371422
14381423 def map (
1439- self ,
1440- map_type : MapType ,
1441- key_result : Callable [[], List [ID2ParquetPath ]],
1442- value_result : Callable [[], List [ID2ParquetPath ]],
1424+ self ,
1425+ map_type : MapType ,
1426+ key_result : Callable [[], List [ID2ParquetPath ]],
1427+ value_result : Callable [[], List [ID2ParquetPath ]],
14431428 ) -> List [ID2ParquetPath ]:
14441429 self ._field_id = map_type .key_id
14451430 self ._path .append ("key_value.key" )
@@ -1456,7 +1441,7 @@ def primitive(self, primitive: PrimitiveType) -> List[ID2ParquetPath]:
14561441
14571442
14581443def parquet_path_to_id_mapping (
1459- schema : Schema ,
1444+ schema : Schema ,
14601445) -> Dict [str , int ]:
14611446 """
14621447 Compute the mapping of parquet column path to Iceberg ID.
@@ -1475,11 +1460,11 @@ def parquet_path_to_id_mapping(
14751460
14761461
14771462def fill_parquet_file_metadata (
1478- df : DataFile ,
1479- parquet_metadata : pq .FileMetaData ,
1480- file_size : int ,
1481- stats_columns : Dict [int , StatisticsCollector ],
1482- parquet_column_mapping : Dict [str , int ],
1463+ df : DataFile ,
1464+ parquet_metadata : pq .FileMetaData ,
1465+ file_size : int ,
1466+ stats_columns : Dict [int , StatisticsCollector ],
1467+ parquet_column_mapping : Dict [str , int ],
14831468) -> None :
14841469 """
14851470 Compute and fill the following fields of the DataFile object.
0 commit comments