Skip to content

Commit b8c2ae3

Browse files
committed
removing whole file reformatting
1 parent 523ff61 commit b8c2ae3

File tree

1 file changed

+51
-66
lines changed

1 file changed

+51
-66
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 51 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -590,16 +590,14 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi
590590
raise ValueError(f"Unsupported file format: {file_format}")
591591

592592

593-
def _construct_fragment(fs: FileSystem, data_file: DataFile,
594-
file_format_kwargs: Dict[str, Any] = EMPTY_DICT) -> ds.Fragment:
593+
def _construct_fragment(fs: FileSystem, data_file: DataFile, file_format_kwargs: Dict[str, Any] = EMPTY_DICT) -> ds.Fragment:
595594
_, _, path = PyArrowFileIO.parse_location(data_file.file_path)
596595
return _get_file_format(data_file.file_format, **file_format_kwargs).make_fragment(path, fs)
597596

598597

599598
def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
600599
delete_fragment = _construct_fragment(
601-
fs, data_file,
602-
file_format_kwargs={"dictionary_columns": ("file_path",), "pre_buffer": True, "buffer_size": ONE_MEGABYTE}
600+
fs, data_file, file_format_kwargs={"dictionary_columns": ("file_path",), "pre_buffer": True, "buffer_size": ONE_MEGABYTE}
603601
)
604602
table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
605603
table = table.unify_dictionaries()
@@ -731,8 +729,7 @@ def _get_field_doc(field: pa.Field) -> Optional[str]:
731729

732730

733731
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
734-
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[
735-
NestedField]:
732+
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]:
736733
fields = []
737734
for i, field in enumerate(arrow_fields):
738735
field_id = _get_field_id(field)
@@ -756,7 +753,7 @@ def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) ->
756753
return None
757754

758755
def map(
759-
self, map_type: pa.MapType, key_result: Optional[IcebergType], value_result: Optional[IcebergType]
756+
self, map_type: pa.MapType, key_result: Optional[IcebergType], value_result: Optional[IcebergType]
760757
) -> Optional[IcebergType]:
761758
key_field = map_type.key_field
762759
key_id = _get_field_id(key_field)
@@ -825,15 +822,15 @@ def _hack_names(column_name_list: list[str], enabled: bool):
825822
return column_name_list
826823

827824
def _task_to_table(
828-
fs: FileSystem,
829-
task: FileScanTask,
830-
bound_row_filter: BooleanExpression,
831-
projected_schema: Schema,
832-
projected_field_ids: Set[int],
833-
positional_deletes: Optional[List[ChunkedArray]],
834-
case_sensitive: bool,
835-
row_counts: List[int],
836-
limit: Optional[int] = None,
825+
fs: FileSystem,
826+
task: FileScanTask,
827+
bound_row_filter: BooleanExpression,
828+
projected_schema: Schema,
829+
projected_field_ids: Set[int],
830+
positional_deletes: Optional[List[ChunkedArray]],
831+
case_sensitive: bool,
832+
row_counts: List[int],
833+
limit: Optional[int] = None,
837834
) -> Optional[pa.Table]:
838835
if limit and sum(row_counts) >= limit:
839836
return None
@@ -848,17 +845,15 @@ def _task_to_table(
848845
schema_raw = metadata.get(ICEBERG_SCHEMA)
849846
# TODO: if field_ids are not present, Name Mapping should be implemented to look them up in the table schema,
850847
# see https://github.com/apache/iceberg/issues/7451
851-
file_schema = Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(
852-
physical_schema)
848+
file_schema = Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema)
853849

854850
pyarrow_filter = None
855851
if bound_row_filter is not AlwaysTrue():
856852
translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
857853
bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
858854
pyarrow_filter = expression_to_pyarrow(bound_file_filter)
859855

860-
file_project_schema = sanitize_column_names(
861-
prune_columns(file_schema, projected_field_ids, select_full_types=False))
856+
file_project_schema = sanitize_column_names(prune_columns(file_schema, projected_field_ids, select_full_types=False))
862857

863858
if file_schema is None:
864859
raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
@@ -929,12 +924,12 @@ def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dic
929924

930925

931926
def project_table(
932-
tasks: Iterable[FileScanTask],
933-
table: Table,
934-
row_filter: BooleanExpression,
935-
projected_schema: Schema,
936-
case_sensitive: bool = True,
937-
limit: Optional[int] = None,
927+
tasks: Iterable[FileScanTask],
928+
table: Table,
929+
row_filter: BooleanExpression,
930+
projected_schema: Schema,
931+
case_sensitive: bool = True,
932+
limit: Optional[int] = None,
938933
) -> pa.Table:
939934
"""Resolve the right columns based on the identifier.
940935
@@ -1019,8 +1014,7 @@ def project_table(
10191014

10201015

10211016
def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
1022-
struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema),
1023-
ArrowAccessor(file_schema))
1017+
struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema))
10241018

10251019
arrays = []
10261020
fields = []
@@ -1043,12 +1037,11 @@ def cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
10431037
return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type)))
10441038
return values
10451039

1046-
def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[
1047-
pa.Array]:
1040+
def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[pa.Array]:
10481041
return struct_result
10491042

10501043
def struct(
1051-
self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
1044+
self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
10521045
) -> Optional[pa.Array]:
10531046
if struct_array is None:
10541047
return None
@@ -1071,17 +1064,15 @@ def struct(
10711064
def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional[pa.Array]) -> Optional[pa.Array]:
10721065
return field_array
10731066

1074-
def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[
1075-
pa.Array]:
1067+
def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]:
10761068
return (
10771069
pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array))
10781070
if isinstance(list_array, pa.ListArray)
10791071
else None
10801072
)
10811073

10821074
def map(
1083-
self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array],
1084-
value_result: Optional[pa.Array]
1075+
self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array]
10851076
) -> Optional[pa.Array]:
10861077
return (
10871078
pa.MapArray.from_arrays(
@@ -1202,8 +1193,7 @@ class StatsAggregator:
12021193
current_max: Any
12031194
trunc_length: Optional[int]
12041195

1205-
def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str,
1206-
trunc_length: Optional[int] = None) -> None:
1196+
def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc_length: Optional[int] = None) -> None:
12071197
self.current_min = None
12081198
self.current_max = None
12091199
self.trunc_length = trunc_length
@@ -1316,30 +1306,27 @@ def __init__(self, schema: Schema, properties: Dict[str, str]):
13161306
self._properties = properties
13171307
self._default_mode = self._properties.get(DEFAULT_METRICS_MODE_KEY)
13181308

1319-
def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsCollector]]) -> List[
1320-
StatisticsCollector]:
1309+
def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]:
13211310
return struct_result()
13221311

13231312
def struct(
1324-
self, struct: StructType, field_results: List[Callable[[], List[StatisticsCollector]]]
1313+
self, struct: StructType, field_results: List[Callable[[], List[StatisticsCollector]]]
13251314
) -> List[StatisticsCollector]:
13261315
return list(chain(*[result() for result in field_results]))
13271316

1328-
def field(self, field: NestedField, field_result: Callable[[], List[StatisticsCollector]]) -> List[
1329-
StatisticsCollector]:
1317+
def field(self, field: NestedField, field_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]:
13301318
self._field_id = field.field_id
13311319
return field_result()
13321320

1333-
def list(self, list_type: ListType, element_result: Callable[[], List[StatisticsCollector]]) -> List[
1334-
StatisticsCollector]:
1321+
def list(self, list_type: ListType, element_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]:
13351322
self._field_id = list_type.element_id
13361323
return element_result()
13371324

13381325
def map(
1339-
self,
1340-
map_type: MapType,
1341-
key_result: Callable[[], List[StatisticsCollector]],
1342-
value_result: Callable[[], List[StatisticsCollector]],
1326+
self,
1327+
map_type: MapType,
1328+
key_result: Callable[[], List[StatisticsCollector]],
1329+
value_result: Callable[[], List[StatisticsCollector]],
13431330
) -> List[StatisticsCollector]:
13441331
self._field_id = map_type.key_id
13451332
k = key_result()
@@ -1362,8 +1349,8 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
13621349
metrics_mode = match_metrics_mode(col_mode)
13631350

13641351
if (
1365-
not (isinstance(primitive, StringType) or isinstance(primitive, BinaryType))
1366-
and metrics_mode.type == MetricModeTypes.TRUNCATE
1352+
not (isinstance(primitive, StringType) or isinstance(primitive, BinaryType))
1353+
and metrics_mode.type == MetricModeTypes.TRUNCATE
13671354
):
13681355
metrics_mode = MetricsMode(MetricModeTypes.FULL)
13691356

@@ -1372,13 +1359,12 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
13721359
if is_nested and metrics_mode.type in [MetricModeTypes.TRUNCATE, MetricModeTypes.FULL]:
13731360
metrics_mode = MetricsMode(MetricModeTypes.COUNTS)
13741361

1375-
return [StatisticsCollector(field_id=self._field_id, iceberg_type=primitive, mode=metrics_mode,
1376-
column_name=column_name)]
1362+
return [StatisticsCollector(field_id=self._field_id, iceberg_type=primitive, mode=metrics_mode, column_name=column_name)]
13771363

13781364

13791365
def compute_statistics_plan(
1380-
schema: Schema,
1381-
table_properties: Dict[str, str],
1366+
schema: Schema,
1367+
table_properties: Dict[str, str],
13821368
) -> Dict[int, StatisticsCollector]:
13831369
"""
13841370
Compute the statistics plan for all columns.
@@ -1417,8 +1403,7 @@ def __init__(self) -> None:
14171403
def schema(self, schema: Schema, struct_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]:
14181404
return struct_result()
14191405

1420-
def struct(self, struct: StructType, field_results: List[Callable[[], List[ID2ParquetPath]]]) -> List[
1421-
ID2ParquetPath]:
1406+
def struct(self, struct: StructType, field_results: List[Callable[[], List[ID2ParquetPath]]]) -> List[ID2ParquetPath]:
14221407
return list(chain(*[result() for result in field_results]))
14231408

14241409
def field(self, field: NestedField, field_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]:
@@ -1436,10 +1421,10 @@ def list(self, list_type: ListType, element_result: Callable[[], List[ID2Parquet
14361421
return result
14371422

14381423
def map(
1439-
self,
1440-
map_type: MapType,
1441-
key_result: Callable[[], List[ID2ParquetPath]],
1442-
value_result: Callable[[], List[ID2ParquetPath]],
1424+
self,
1425+
map_type: MapType,
1426+
key_result: Callable[[], List[ID2ParquetPath]],
1427+
value_result: Callable[[], List[ID2ParquetPath]],
14431428
) -> List[ID2ParquetPath]:
14441429
self._field_id = map_type.key_id
14451430
self._path.append("key_value.key")
@@ -1456,7 +1441,7 @@ def primitive(self, primitive: PrimitiveType) -> List[ID2ParquetPath]:
14561441

14571442

14581443
def parquet_path_to_id_mapping(
1459-
schema: Schema,
1444+
schema: Schema,
14601445
) -> Dict[str, int]:
14611446
"""
14621447
Compute the mapping of parquet column path to Iceberg ID.
@@ -1475,11 +1460,11 @@ def parquet_path_to_id_mapping(
14751460

14761461

14771462
def fill_parquet_file_metadata(
1478-
df: DataFile,
1479-
parquet_metadata: pq.FileMetaData,
1480-
file_size: int,
1481-
stats_columns: Dict[int, StatisticsCollector],
1482-
parquet_column_mapping: Dict[str, int],
1463+
df: DataFile,
1464+
parquet_metadata: pq.FileMetaData,
1465+
file_size: int,
1466+
stats_columns: Dict[int, StatisticsCollector],
1467+
parquet_column_mapping: Dict[str, int],
14831468
) -> None:
14841469
"""
14851470
Compute and fill the following fields of the DataFile object.

0 commit comments

Comments
 (0)