@@ -428,6 +428,44 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
428
428
for data_file in data_files :
429
429
update_snapshot .append_data_file (data_file )
430
430
431
def merge_append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
    """
    Shorthand API for merge-appending a PyArrow table to a table transaction.

    Validates the dataframe, casts it to the table's Arrow schema when the two
    differ, writes it out as data files and registers those files on the
    snapshot update obtained from ``update_snapshot(...).merge_append()``.
    No data files are written when the dataframe has zero rows.

    Args:
        df: The Arrow dataframe whose rows will be appended to the table
        snapshot_properties: Custom properties to be added to the snapshot summary

    Raises:
        ModuleNotFoundError: If PyArrow is not installed.
        ValueError: If ``df`` is not a PyArrow table, or if the partition spec
            contains a transform that cannot be applied with PyArrow.
    """
    try:
        import pyarrow as pa
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e

    if not isinstance(df, pa.Table):
        raise ValueError(f"Expected PyArrow table, got: {df}")

    # Use one metadata source for every check and for the write itself. The
    # partition check already relied on self.table_metadata; the schema check
    # and the writer previously read self._table instead.
    # NOTE(review): presumably self.table_metadata is the transaction's own
    # view of the metadata (including changes staged earlier in the same
    # transaction) -- confirm against the property definition.
    table_metadata = self.table_metadata

    if unsupported_partitions := [
        field for field in table_metadata.spec().fields if not field.transform.supports_pyarrow_transform
    ]:
        raise ValueError(
            f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
        )

    table_schema = table_metadata.schema()
    _check_schema_compatible(table_schema, other_schema=df.schema)
    # cast if the two schemas are compatible but not equal
    table_arrow_schema = table_schema.as_arrow()
    if table_arrow_schema != df.schema:
        df = df.cast(table_arrow_schema)

    with self.update_snapshot(snapshot_properties=snapshot_properties).merge_append() as update_snapshot:
        # skip writing data files if the dataframe is empty
        if df.shape[0] > 0:
            data_files = _dataframe_to_data_files(
                table_metadata=table_metadata, write_uuid=update_snapshot.commit_uuid, df=df, io=self._table.io
            )
            for data_file in data_files:
                update_snapshot.append_data_file(data_file)
431
469
def overwrite (
432
470
self , df : pa .Table , overwrite_filter : BooleanExpression = ALWAYS_TRUE , snapshot_properties : Dict [str , str ] = EMPTY_DICT
433
471
) -> None :
@@ -1352,6 +1390,17 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
1352
1390
with self .transaction () as tx :
1353
1391
tx .append (df = df , snapshot_properties = snapshot_properties )
1354
1392
1393
def merge_append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
    """
    Shorthand API for merge-appending a PyArrow table to the table.

    Opens a transaction on this table and delegates to that transaction's
    ``merge_append`` with the same arguments.

    Args:
        df: The Arrow dataframe whose rows will be appended to the table
        snapshot_properties: Custom properties to be added to the snapshot summary
    """
    with self.transaction() as txn:
        txn.merge_append(df=df, snapshot_properties=snapshot_properties)
1403
+
1355
1404
def overwrite (
1356
1405
self , df : pa .Table , overwrite_filter : BooleanExpression = ALWAYS_TRUE , snapshot_properties : Dict [str , str ] = EMPTY_DICT
1357
1406
) -> None :
0 commit comments