2121from copy import copy
2222from enum import Enum
2323from types import TracebackType
24- from typing import Any , Generator
25- from typing import Callable
26- from typing import Dict
27- from typing import Iterator
28- from typing import List
29- from typing import Literal
30- from typing import Optional
31- from typing import Type
24+ from typing import (
25+ Any ,
26+ Dict ,
27+ Iterator ,
28+ List ,
29+ Literal ,
30+ Optional ,
31+ Type ,
32+ )
3233
3334from pydantic_core import to_json
3435
3536from pyiceberg .avro .file import AvroFile , AvroOutputFile
3637from pyiceberg .conversions import to_bytes
3738from pyiceberg .exceptions import ValidationError
38- from pyiceberg .io import FileIO
39- from pyiceberg .io import InputFile
40- from pyiceberg .io import OutputFile
39+ from pyiceberg .io import FileIO , InputFile , OutputFile
4140from pyiceberg .partitioning import PartitionSpec
4241from pyiceberg .schema import Schema
4342from pyiceberg .typedef import Record , TableVersion
5352 StringType ,
5453 StructType ,
5554)
56- from pyiceberg .typedef import EMPTY_DICT
5755
5856UNASSIGNED_SEQ = - 1
5957DEFAULT_BLOCK_SIZE = 67108864 # 64 * 1024 * 1024
@@ -103,9 +101,7 @@ def __repr__(self) -> str:
103101
104102DATA_FILE_TYPE : Dict [int , StructType ] = {
105103 1 : StructType (
106- NestedField (
107- field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme"
108- ),
104+ NestedField (field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme" ),
109105 NestedField (
110106 field_id = 101 ,
111107 name = "file_format" ,
@@ -120,15 +116,9 @@ def __repr__(self) -> str:
120116 required = True ,
121117 doc = "Partition data tuple, schema based on the partition spec" ,
122118 ),
119+ NestedField (field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file" ),
123120 NestedField (
124- field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file"
125- ),
126- NestedField (
127- field_id = 104 ,
128- name = "file_size_in_bytes" ,
129- field_type = LongType (),
130- required = True ,
131- doc = "Total file size in bytes" ,
121+ field_id = 104 , name = "file_size_in_bytes" , field_type = LongType (), required = True , doc = "Total file size in bytes"
132122 ),
133123 NestedField (
134124 field_id = 105 ,
@@ -181,11 +171,7 @@ def __repr__(self) -> str:
181171 doc = "Map of column id to upper bound" ,
182172 ),
183173 NestedField (
184- field_id = 131 ,
185- name = "key_metadata" ,
186- field_type = BinaryType (),
187- required = False ,
188- doc = "Encryption key metadata blob" ,
174+ field_id = 131 , name = "key_metadata" , field_type = BinaryType (), required = False , doc = "Encryption key metadata blob"
189175 ),
190176 NestedField (
191177 field_id = 132 ,
@@ -205,9 +191,7 @@ def __repr__(self) -> str:
205191 doc = "File format name: avro, orc, or parquet" ,
206192 initial_default = DataFileContent .DATA ,
207193 ),
208- NestedField (
209- field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme"
210- ),
194+ NestedField (field_id = 100 , name = "file_path" , field_type = StringType (), required = True , doc = "Location URI with FS scheme" ),
211195 NestedField (
212196 field_id = 101 ,
213197 name = "file_format" ,
@@ -222,15 +206,9 @@ def __repr__(self) -> str:
222206 required = True ,
223207 doc = "Partition data tuple, schema based on the partition spec" ,
224208 ),
209+ NestedField (field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file" ),
225210 NestedField (
226- field_id = 103 , name = "record_count" , field_type = LongType (), required = True , doc = "Number of records in the file"
227- ),
228- NestedField (
229- field_id = 104 ,
230- name = "file_size_in_bytes" ,
231- field_type = LongType (),
232- required = True ,
233- doc = "Total file size in bytes" ,
211+ field_id = 104 , name = "file_size_in_bytes" , field_type = LongType (), required = True , doc = "Total file size in bytes"
234212 ),
235213 NestedField (
236214 field_id = 108 ,
@@ -275,11 +253,7 @@ def __repr__(self) -> str:
275253 doc = "Map of column id to upper bound" ,
276254 ),
277255 NestedField (
278- field_id = 131 ,
279- name = "key_metadata" ,
280- field_type = BinaryType (),
281- required = False ,
282- doc = "Encryption key metadata blob" ,
256+ field_id = 131 , name = "key_metadata" , field_type = BinaryType (), required = False , doc = "Encryption key metadata blob"
283257 ),
284258 NestedField (
285259 field_id = 132 ,
@@ -307,34 +281,28 @@ def __repr__(self) -> str:
307281
308282
def data_file_with_partition(partition_type: StructType, format_version: TableVersion) -> StructType:
    """Return the data-file struct for a format version with a concrete partition type.

    A fresh struct is first assembled from the fields of ``partition_type``; only
    ``field_id``, ``name``, ``field_type`` and ``required`` are carried over from
    each field. The result is the ``DATA_FILE_TYPE[format_version]`` struct in
    which every field whose id is 102 (the "partition" field) is replaced by a
    required field holding that freshly built struct. All other fields are
    passed through unchanged.

    Args:
        partition_type: Struct whose fields describe the partition tuple.
        format_version: Key used to look up the base struct in ``DATA_FILE_TYPE``.

    Returns:
        A new ``StructType`` with the partition field specialised.
    """
    # Rebuild the partition fields one by one, keeping only the four copied attributes.
    partition_fields = []
    for source_field in partition_type.fields:
        partition_fields.append(
            NestedField(
                field_id=source_field.field_id,
                name=source_field.name,
                field_type=source_field.field_type,
                required=source_field.required,
            )
        )
    partition_struct = StructType(*partition_fields)

    # Walk the base data-file fields, swapping in the specialised partition field.
    result_fields = []
    for base_field in DATA_FILE_TYPE[format_version].fields:
        if base_field.field_id == 102:
            result_fields.append(
                NestedField(
                    field_id=102,
                    name="partition",
                    field_type=partition_struct,
                    required=True,
                    doc="Partition data tuple, schema based on the partition spec",
                )
            )
        else:
            result_fields.append(base_field)

    return StructType(*result_fields)
338306
339307
340308class DataFile (Record ):
@@ -415,18 +383,14 @@ def __eq__(self, other: Any) -> bool:
415383 ),
416384}
417385
# Struct form (via ``Schema.as_struct()``) of each manifest-entry schema, keyed by format version.
MANIFEST_ENTRY_SCHEMAS_STRUCT = {
    version: entry_schema.as_struct() for version, entry_schema in MANIFEST_ENTRY_SCHEMAS.items()
}
421387
422388
def manifest_entry_schema_with_data_file(format_version: TableVersion, data_file: StructType) -> Schema:
    """Return the manifest-entry schema for a format version with a custom data-file struct.

    Every field of ``MANIFEST_ENTRY_SCHEMAS[format_version]`` is kept as is,
    except fields whose id is 2: those are replaced by a required
    ``"data_file"`` field whose type is ``data_file``.

    Args:
        format_version: Key used to look up the base schema in ``MANIFEST_ENTRY_SCHEMAS``.
        data_file: Struct to use as the type of the ``data_file`` field.

    Returns:
        A new ``Schema`` built from the resulting fields.
    """
    schema_fields = []
    for base_field in MANIFEST_ENTRY_SCHEMAS[format_version].fields:
        if base_field.field_id == 2:
            # Field id 2 is the data_file slot; substitute the caller-supplied struct.
            schema_fields.append(NestedField(2, "data_file", data_file, required=True))
        else:
            schema_fields.append(base_field)
    return Schema(*schema_fields)
430394
431395
432396class ManifestEntry (Record ):
@@ -496,9 +460,7 @@ def update(self, value: Any) -> None:
496460 self ._min = min (self ._min , value )
497461
498462
499- def construct_partition_summaries (
500- spec : PartitionSpec , schema : Schema , partitions : List [Record ]
501- ) -> List [PartitionFieldSummary ]:
463+ def construct_partition_summaries (spec : PartitionSpec , schema : Schema , partitions : List [Record ]) -> List [PartitionFieldSummary ]:
502464 types = [field .field_type for field in spec .partition_type (schema ).fields ]
503465 field_stats = [PartitionFieldStats (field_type ) for field_type in types ]
504466 for partition_keys in partitions :
@@ -522,9 +484,7 @@ def construct_partition_summaries(
522484 NestedField (512 , "added_rows_count" , LongType (), required = False ),
523485 NestedField (513 , "existing_rows_count" , LongType (), required = False ),
524486 NestedField (514 , "deleted_rows_count" , LongType (), required = False ),
525- NestedField (
526- 507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False
527- ),
487+ NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
528488 NestedField (519 , "key_metadata" , BinaryType (), required = False ),
529489 ),
530490 2 : Schema (
@@ -541,16 +501,12 @@ def construct_partition_summaries(
541501 NestedField (512 , "added_rows_count" , LongType (), required = True ),
542502 NestedField (513 , "existing_rows_count" , LongType (), required = True ),
543503 NestedField (514 , "deleted_rows_count" , LongType (), required = True ),
544- NestedField (
545- 507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False
546- ),
504+ NestedField (507 , "partitions" , ListType (508 , PARTITION_FIELD_SUMMARY_TYPE , element_required = True ), required = False ),
547505 NestedField (519 , "key_metadata" , BinaryType (), required = False ),
548506 ),
549507}
550508
# Struct form (via ``Schema.as_struct()``) of each manifest-list file schema, keyed by format version.
MANIFEST_LIST_FILE_STRUCTS = {
    version: list_schema.as_struct() for version, list_schema in MANIFEST_LIST_FILE_SCHEMAS.items()
}
554510
555511
556512POSITIONAL_DELETE_SCHEMA = Schema (
@@ -669,16 +625,12 @@ def _inherit_from_manifest(entry: ManifestEntry, manifest: ManifestFile) -> Mani
669625
670626 # in v1 tables, the data sequence number is not persisted and can be safely defaulted to 0
671627 # in v2 tables, the data sequence number should be inherited iff the entry status is ADDED
672- if entry .data_sequence_number is None and (
673- manifest .sequence_number == 0 or entry .status == ManifestEntryStatus .ADDED
674- ):
628+ if entry .data_sequence_number is None and (manifest .sequence_number == 0 or entry .status == ManifestEntryStatus .ADDED ):
675629 entry .data_sequence_number = manifest .sequence_number
676630
677631 # in v1 tables, the file sequence number is not persisted and can be safely defaulted to 0
678632 # in v2 tables, the file sequence number should be inherited iff the entry status is ADDED
679- if entry .file_sequence_number is None and (
680- manifest .sequence_number == 0 or entry .status == ManifestEntryStatus .ADDED
681- ):
633+ if entry .file_sequence_number is None and (manifest .sequence_number == 0 or entry .status == ManifestEntryStatus .ADDED ):
682634 # Only available in V2, always 0 in V1
683635 entry .file_sequence_number = manifest .sequence_number
684636
@@ -1001,11 +953,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id
1001953 super ().__init__ (
1002954 format_version = 1 ,
1003955 output_file = output_file ,
1004- meta = {
1005- "snapshot-id" : str (snapshot_id ),
1006- "parent-snapshot-id" : str (parent_snapshot_id ),
1007- "format-version" : "1" ,
1008- },
956+ meta = {"snapshot-id" : str (snapshot_id ), "parent-snapshot-id" : str (parent_snapshot_id ), "format-version" : "1" },
1009957 )
1010958
1011959 def prepare_manifest (self , manifest_file : ManifestFile ) -> ManifestFile :
@@ -1018,9 +966,7 @@ class ManifestListWriterV2(ManifestListWriter):
1018966 _commit_snapshot_id : int
1019967 _sequence_number : int
1020968
1021- def __init__ (
1022- self , output_file : OutputFile , snapshot_id : int , parent_snapshot_id : Optional [int ], sequence_number : int
1023- ):
969+ def __init__ (self , output_file : OutputFile , snapshot_id : int , parent_snapshot_id : Optional [int ], sequence_number : int ):
1024970 super ().__init__ (
1025971 format_version = 2 ,
1026972 output_file = output_file ,
@@ -1072,5 +1018,3 @@ def write_manifest_list(
10721018 return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number )
10731019 else :
10741020 raise ValueError (f"Cannot write manifest list for table version: { format_version } " )
1075-
1076-
0 commit comments