Commit b8b2f66

Use ArrowScan.to_table to replace project_table (#1180)
* Use ArrowScan.to_table to replace project_table in these files:
  * pyiceberg/table/__init__.py
  * pyiceberg/io/pyarrow.py
  * pyiceberg/test_pyarrow.py
* Replace all remaining uses of project_table with ArrowScan.to_table
* Fix format
* Modify by ruff
1 parent 41a3c8e commit b8b2f66
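
For context, a minimal before/after sketch of the call-shape change this commit makes. The names metadata, io, schema, and tasks are placeholders, and the keyword arguments mirror the diffs below; this is an illustration, not a verbatim excerpt from the codebase.

from pyiceberg.expressions import AlwaysTrue
from pyiceberg.io.pyarrow import ArrowScan

# Before (removed by this commit): project_table took the scan configuration
# and the file-scan tasks in a single call.
# df = project_table(
#     tasks=tasks,
#     table_metadata=metadata,
#     io=io,
#     row_filter=AlwaysTrue(),
#     projected_schema=schema,
# )

# After: the scan configuration moves to the ArrowScan constructor and the
# tasks are passed to to_table(), which returns a pyarrow.Table.
df = ArrowScan(
    table_metadata=metadata,  # placeholder: a TableMetadata instance
    io=io,                    # placeholder: a FileIO instance
    projected_schema=schema,  # placeholder: an Iceberg Schema
    row_filter=AlwaysTrue(),
).to_table(tasks=tasks)       # placeholder: a list of FileScanTask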

File tree

2 files changed: +28 -31 lines changed

pyiceberg/table/__init__.py

Lines changed: 4 additions & 5 deletions
@@ -523,9 +523,9 @@ def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properti
             snapshot_properties: Custom properties to be added to the snapshot summary
         """
         from pyiceberg.io.pyarrow import (
+            ArrowScan,
             _dataframe_to_data_files,
             _expression_to_complementary_pyarrow,
-            project_table,
         )

         if (
@@ -559,13 +559,12 @@ def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properti
             # - Apply the latest partition-spec
             # - And sort order when added
             for original_file in files:
-                df = project_table(
-                    tasks=[original_file],
+                df = ArrowScan(
                     table_metadata=self.table_metadata,
                     io=self._table.io,
-                    row_filter=AlwaysTrue(),
                     projected_schema=self.table_metadata.schema(),
-                )
+                    row_filter=AlwaysTrue(),
+                ).to_table(tasks=[original_file])
                 filtered_df = df.filter(preserve_row_filter)

                 # Only rewrite if there are records being deleted
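
The rewritten loop relies on to_table() returning a pyarrow.Table, so the complementary row filter can be applied directly with Table.filter. A self-contained sketch of that filtering step, with stand-in data in place of a scanned file:

import pyarrow as pa
import pyarrow.compute as pc

# Stand-in for the table produced by ArrowScan(...).to_table(tasks=[original_file]).
df = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Stand-in for preserve_row_filter, the complementary expression built by
# _expression_to_complementary_pyarrow in the surrounding code: keep only the
# rows NOT matched by the delete filter.
preserve_row_filter = pc.field("id") != 2

filtered_df = df.filter(preserve_row_filter)
print(filtered_df.num_rows)  # 2: the matched row is the one being deleted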

tests/io/test_pyarrow.py

Lines changed: 24 additions & 26 deletions
@@ -58,6 +58,7 @@
 from pyiceberg.io import InputStream, OutputStream, load_file_io
 from pyiceberg.io.pyarrow import (
     ICEBERG_SCHEMA,
+    ArrowScan,
     PyArrowFile,
     PyArrowFileIO,
     StatsAggregator,
@@ -69,7 +70,6 @@
     _to_requested_schema,
     bin_pack_arrow_table,
     expression_to_pyarrow,
-    project_table,
     schema_to_pyarrow,
 )
 from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
@@ -952,7 +952,19 @@ def file_map(schema_map: Schema, tmpdir: str) -> str:
 def project(
     schema: Schema, files: List[str], expr: Optional[BooleanExpression] = None, table_schema: Optional[Schema] = None
 ) -> pa.Table:
-    return project_table(
+    return ArrowScan(
+        table_metadata=TableMetadataV2(
+            location="file://a/b/",
+            last_column_id=1,
+            format_version=2,
+            schemas=[table_schema or schema],
+            partition_specs=[PartitionSpec()],
+        ),
+        io=PyArrowFileIO(),
+        projected_schema=schema,
+        row_filter=expr or AlwaysTrue(),
+        case_sensitive=True,
+    ).to_table(
         tasks=[
             FileScanTask(
                 DataFile(
@@ -965,18 +977,7 @@ def project(
                 )
             )
             for file in files
-        ],
-        table_metadata=TableMetadataV2(
-            location="file://a/b/",
-            last_column_id=1,
-            format_version=2,
-            schemas=[table_schema or schema],
-            partition_specs=[PartitionSpec()],
-        ),
-        io=PyArrowFileIO(),
-        row_filter=expr or AlwaysTrue(),
-        projected_schema=schema,
-        case_sensitive=True,
+        ]
     )

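The helper keeps its signature, so the test call sites below need no changes; only its internals swap project_table for ArrowScan(...).to_table(...). A hypothetical invocation (the schema and file path are illustrative, not fixtures from this module):

from pyiceberg.expressions import GreaterThan
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, NestedField

# Hypothetical inputs; the real tests write Parquet files via tmpdir fixtures.
id_schema = Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=False))
result = project(id_schema, ["/tmp/example.parquet"], expr=GreaterThan("id", 4))
# result is a pyarrow.Table containing only the projected "id" column.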

@@ -1411,9 +1412,7 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
         data_file=example_task.file,
         delete_files={DataFile(content=DataFileContent.POSITION_DELETES, file_path=deletes_file, file_format=FileFormat.PARQUET)},
     )
-
-    with_deletes = project_table(
-        tasks=[example_task_with_delete],
+    with_deletes = ArrowScan(
         table_metadata=TableMetadataV2(
             location=metadata_location,
             last_column_id=1,
@@ -1423,9 +1422,9 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
             partition_specs=[PartitionSpec()],
         ),
         io=load_file_io(),
-        row_filter=AlwaysTrue(),
         projected_schema=table_schema_simple,
-    )
+        row_filter=AlwaysTrue(),
+    ).to_table(tasks=[example_task_with_delete])

     assert (
         str(with_deletes)
@@ -1450,8 +1449,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
         },
     )

-    with_deletes = project_table(
-        tasks=[example_task_with_delete],
+    with_deletes = ArrowScan(
         table_metadata=TableMetadataV2(
             location=metadata_location,
             last_column_id=1,
@@ -1461,9 +1459,9 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
             partition_specs=[PartitionSpec()],
         ),
         io=load_file_io(),
-        row_filter=AlwaysTrue(),
         projected_schema=table_schema_simple,
-    )
+        row_filter=AlwaysTrue(),
+    ).to_table(tasks=[example_task_with_delete])

     assert (
         str(with_deletes)
@@ -1480,8 +1478,8 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_

 def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Schema) -> None:
     metadata_location = "file://a/b/c.json"
-    projection = project_table(
-        tasks=[example_task],
+
+    projection = ArrowScan(
         table_metadata=TableMetadataV2(
             location=metadata_location,
             last_column_id=1,
@@ -1494,7 +1492,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
         case_sensitive=True,
         projected_schema=table_schema_simple,
         row_filter=AlwaysTrue(),
-    )
+    ).to_table(tasks=[example_task])

     assert (
         str(projection)
