
Commit a396149

Test again PyArrow 17.0.0
1 parent: 4282d2f

File tree: 5 files changed, +9 −18 lines

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 11 deletions

@@ -1050,11 +1050,6 @@ def _task_to_record_batches(
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # With PyArrow 16.0.0 there is an issue with casting record-batches:
-        # https://github.com/apache/arrow/issues/41884
-        # https://github.com/apache/arrow/issues/43183
-        # Would be good to remove this later on
-        schema=_pyarrow_schema_ensure_large_types(physical_schema),
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
@@ -1070,12 +1065,7 @@ def _task_to_record_batches(
             batch = batch.take(indices)
         # Apply the user filter
         if pyarrow_filter is not None:
-            # we need to switch back and forth between RecordBatch and Table
-            # as Expression filter isn't yet supported in RecordBatch
-            # https://github.com/apache/arrow/issues/39220
-            arrow_table = pa.Table.from_batches([batch])
-            arrow_table = arrow_table.filter(pyarrow_filter)
-            batch = arrow_table.to_batches()[0]
+            batch = batch.filter(pyarrow_filter)
         yield _to_requested_schema(projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True)
         current_index += len(batch)
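The one-line replacement in the second hunk works because, with the PyArrow 17 pinned below, RecordBatch.filter accepts a compute Expression directly (the limitation tracked in apache/arrow#39220), so the RecordBatch-to-Table round-trip is no longer needed. A minimal sketch with made-up data:

    import pyarrow as pa
    import pyarrow.compute as pc

    # Hypothetical batch; the point is that filter() takes an Expression,
    # not only a boolean mask, so no Table round-trip is required.
    batch = pa.record_batch({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    filtered = batch.filter(pc.field("id") > 1)
    print(filtered.to_pydict())  # {'id': [2, 3], 'name': ['b', 'c']}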

pyiceberg/table/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2020,7 +2020,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
                 case_sensitive=self.case_sensitive,
                 limit=self.limit,
             ),
-        )
+        ).cast(target_schema=target_schema)

     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
         return self.to_arrow().to_pandas(**kwargs)
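The added call relies on RecordBatchReader.cast, which returns a new reader that casts each batch to the given schema as it streams; here it coerces the scanned batches to the in-scope target_schema (defined outside this hunk). A minimal sketch with hypothetical data:

    import pyarrow as pa

    # One-batch reader with int32 ids, cast on the fly to int64.
    batch = pa.record_batch({"id": pa.array([1, 2], type=pa.int32())})
    reader = pa.RecordBatchReader.from_batches(batch.schema, [batch])
    target = pa.schema([pa.field("id", pa.int64())])
    assert reader.cast(target_schema=target).read_all().schema == target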

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ fsspec = ">=2023.1.0,<2025.1.0"
 pyparsing = ">=3.1.0,<4.0.0"
 zstandard = ">=0.13.0,<1.0.0"
 tenacity = ">=8.2.3,<9.0.0"
-pyarrow = { version = ">=9.0.0,<18.0.0", optional = true }
+pyarrow = { version = ">=17.0.0,<18.0.0", optional = true }
 pandas = { version = ">=1.0.0,<3.0.0", optional = true }
 duckdb = { version = ">=0.5.0,<2.0.0", optional = true }
 ray = { version = ">=2.0.0,<2.10.0", optional = true }

tests/integration/test_add_files.py

Lines changed: 1 addition & 1 deletion

@@ -549,7 +549,7 @@ def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_ca
     tbl.add_files([file_path])

     table_schema = tbl.scan().to_arrow().schema
-    assert table_schema == arrow_schema_large
+    assert table_schema == arrow_schema

     file_path_large = f"s3://warehouse/default/unpartitioned_with_large_types/v{format_version}/test-1.parquet"
     _write_parquet(
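The expected schema flips because, without the forced large-type cast on read, a scan now reports the file's original types. pa.string() and pa.large_string() are distinct Arrow types (32-bit vs 64-bit offsets), so schema equality distinguishes them even when the values are identical. A small illustration:

    import pyarrow as pa

    # Same logical values, different offset widths -> unequal schemas.
    small = pa.schema([pa.field("name", pa.string())])
    large = pa.schema([pa.field("name", pa.large_string())])
    assert small != large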

tests/integration/test_writes/test_writes.py

Lines changed: 5 additions & 4 deletions

@@ -357,7 +357,8 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads(
     tbl.overwrite(arrow_table)
     spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
     pyiceberg_df = tbl.scan().to_pandas()
-    assert spark_df.equals(pyiceberg_df)
+    assert spark_df['id'].equals(pyiceberg_df['id'])
+    assert all(spark_df['name'].values == pyiceberg_df['name'].values)


 @pytest.mark.integration
@@ -401,12 +402,12 @@ def test_python_writes_with_small_and_large_types_spark_reads(
     assert arrow_table_on_read.schema == pa.schema([
         pa.field("foo", pa.large_string()),
         pa.field("id", pa.int32()),
-        pa.field("name", pa.large_string()),
+        pa.field("name", pa.string()),
         pa.field(
             "address",
             pa.struct([
-                pa.field("street", pa.large_string()),
-                pa.field("city", pa.large_string()),
+                pa.field("street", pa.string()),
+                pa.field("city", pa.string()),
                 pa.field("zip", pa.int32()),
                 pa.field("bar", pa.large_string()),
             ]),
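Splitting the DataFrame-level assert into per-column checks is presumably needed because pandas' equals is dtype-strict: a column that round-trips with a different dtype can fail equals even when every value matches, while an element-wise comparison does not. A quick illustration of the distinction (hypothetical data):

    import pandas as pd

    # equals() requires matching dtypes; element-wise comparison does not.
    a = pd.Series([1, 2], dtype="int32")
    b = pd.Series([1, 2], dtype="int64")
    assert not a.equals(b)
    assert all(a.values == b.values)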
