
Commit c5c1575

Merge branch 'main' of github.com:apache/iceberg-python into fd-test-against-pyarrow-17

2 parents: 4e48478 + 1ed3abd

File tree: 9 files changed, +645 -128 lines

mkdocs/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -16,13 +16,13 @@
 # under the License.

 mkdocs==1.6.0
-griffe==0.47.0
+griffe==0.48.0
 jinja2==3.1.4
 mkdocstrings==0.25.1
 mkdocstrings-python==1.10.5
 mkdocs-literate-nav==0.6.1
 mkdocs-autorefs==1.0.1
 mkdocs-gen-files==0.5.0
-mkdocs-material==9.5.28
+mkdocs-material==9.5.29
 mkdocs-material-extensions==1.3.1
 mkdocs-section-index==0.3.9

poetry.lock

Lines changed: 98 additions & 47 deletions
Some generated files are not rendered by default.

pyiceberg/io/pyarrow.py

Lines changed: 33 additions & 48 deletions
@@ -120,6 +120,7 @@
     Schema,
     SchemaVisitorPerPrimitiveType,
     SchemaWithPartnerVisitor,
+    _check_schema_compatible,
     pre_order_visit,
     promote,
     prune_columns,
@@ -1397,7 +1398,7 @@ def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array:
                 # This can be removed once this has been fixed:
                 # https://github.com/apache/arrow/issues/38809
                 list_array = pa.LargeListArray.from_arrays(list_array.offsets, value_array)
-
+            value_array = self._cast_if_needed(list_type.element_field, value_array)
             arrow_field = pa.large_list(self._construct_field(list_type.element_field, value_array.type))
             return list_array.cast(arrow_field)
         else:
@@ -1407,6 +1408,8 @@ def map(
         self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array]
     ) -> Optional[pa.Array]:
         if isinstance(map_array, pa.MapArray) and key_result is not None and value_result is not None:
+            key_result = self._cast_if_needed(map_type.key_field, key_result)
+            value_result = self._cast_if_needed(map_type.value_field, value_result)
             arrow_field = pa.map_(
                 self._construct_field(map_type.key_field, key_result.type),
                 self._construct_field(map_type.value_field, value_result.type),
@@ -1539,9 +1542,16 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc

         expected_physical_type = _primitive_to_physical(iceberg_type)
         if expected_physical_type != physical_type_string:
-            raise ValueError(
-                f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}"
-            )
+            # Allow promotable physical types
+            # INT32 -> INT64 and FLOAT -> DOUBLE are safe type casts
+            if (physical_type_string == "INT32" and expected_physical_type == "INT64") or (
+                physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE"
+            ):
+                pass
+            else:
+                raise ValueError(
+                    f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}"
+                )

         self.primitive_type = iceberg_type

@@ -1886,16 +1896,6 @@ def data_file_statistics_from_parquet_metadata(
             set the mode for column metrics collection
         parquet_column_mapping (Dict[str, int]): The mapping of the parquet file name to the field ID
     """
-    if parquet_metadata.num_columns != len(stats_columns):
-        raise ValueError(
-            f"Number of columns in statistics configuration ({len(stats_columns)}) is different from the number of columns in pyarrow table ({parquet_metadata.num_columns})"
-        )
-
-    if parquet_metadata.num_columns != len(parquet_column_mapping):
-        raise ValueError(
-            f"Number of columns in column mapping ({len(parquet_column_mapping)}) is different from the number of columns in pyarrow table ({parquet_metadata.num_columns})"
-        )
-
     column_sizes: Dict[int, int] = {}
     value_counts: Dict[int, int] = {}
     split_offsets: List[int] = []
@@ -1988,8 +1988,7 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT
     )

     def write_parquet(task: WriteTask) -> DataFile:
-        table_schema = task.schema
-
+        table_schema = table_metadata.schema()
         # if schema needs to be transformed, use the transformed schema and adjust the arrow table accordingly
         # otherwise use the original schema
         if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema:
@@ -2001,7 +2000,7 @@ def write_parquet(task: WriteTask) -> DataFile:
         batches = [
             _to_requested_schema(
                 requested_schema=file_schema,
-                file_schema=table_schema,
+                file_schema=task.schema,
                 batch=batch,
                 downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
                 include_field_ids=True,
@@ -2060,47 +2059,30 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
     return bin_packed_record_batches


-def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False) -> None:
+def _check_pyarrow_schema_compatible(
+    requested_schema: Schema, provided_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False
+) -> None:
     """
-    Check if the `table_schema` is compatible with `other_schema`.
+    Check if the `requested_schema` is compatible with `provided_schema`.

     Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type.

     Raises:
         ValueError: If the schemas are not compatible.
     """
-    name_mapping = table_schema.name_mapping
+    name_mapping = requested_schema.name_mapping
     try:
-        task_schema = pyarrow_to_schema(
-            other_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
+        provided_schema = pyarrow_to_schema(
+            provided_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
         )
     except ValueError as e:
-        other_schema = _pyarrow_to_schema_without_ids(other_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
-        additional_names = set(other_schema.column_names) - set(table_schema.column_names)
+        provided_schema = _pyarrow_to_schema_without_ids(provided_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
+        additional_names = set(provided_schema._name_to_id.keys()) - set(requested_schema._name_to_id.keys())
         raise ValueError(
             f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)."
         ) from e

-    if table_schema.as_struct() != task_schema.as_struct():
-        from rich.console import Console
-        from rich.table import Table as RichTable
-
-        console = Console(record=True)
-
-        rich_table = RichTable(show_header=True, header_style="bold")
-        rich_table.add_column("")
-        rich_table.add_column("Table field")
-        rich_table.add_column("Dataframe field")
-
-        for lhs in table_schema.fields:
-            try:
-                rhs = task_schema.find_field(lhs.field_id)
-                rich_table.add_row("✅" if lhs == rhs else "❌", str(lhs), str(rhs))
-            except ValueError:
-                rich_table.add_row("❌", str(lhs), "Missing")
-
-        console.print(rich_table)
-        raise ValueError(f"Mismatch in fields:\n{console.export_text()}")
+    _check_schema_compatible(requested_schema, provided_schema)


 def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]:
@@ -2114,7 +2096,7 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
                 f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
             )
         schema = table_metadata.schema()
-        _check_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema())
+        _check_pyarrow_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema())

         statistics = data_file_statistics_from_parquet_metadata(
             parquet_metadata=parquet_metadata,
@@ -2195,7 +2177,7 @@ def _dataframe_to_data_files(
     Returns:
         An iterable that supplies datafiles that represent the table.
     """
-    from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
+    from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, PropertyUtil, TableProperties, WriteTask

     counter = counter or itertools.count(0)
     write_uuid = write_uuid or uuid.uuid4()
@@ -2204,13 +2186,16 @@ def _dataframe_to_data_files(
         property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
         default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
     )
+    name_mapping = table_metadata.schema().name_mapping
+    downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
+    task_schema = pyarrow_to_schema(df.schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)

     if table_metadata.spec().is_unpartitioned():
         yield from write_file(
             io=io,
             table_metadata=table_metadata,
             tasks=iter([
-                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
+                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema)
                 for batches in bin_pack_arrow_table(df, target_file_size)
             ]),
         )
@@ -2225,7 +2210,7 @@ def _dataframe_to_data_files(
                     task_id=next(counter),
                     record_batches=batches,
                     partition_key=partition.partition_key,
-                    schema=table_metadata.schema(),
+                    schema=task_schema,
                 )
                 for partition in partitions
                 for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
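Taken together, the pyarrow.py changes turn `_check_pyarrow_schema_compatible` into a thin wrapper that converts the Arrow schema to an Iceberg schema via the table's name mapping and delegates the field-by-field comparison to the new `_check_schema_compatible` in pyiceberg/schema.py, while write tasks now carry the schema derived from the incoming dataframe so promotable columns get cast during the write. A minimal sketch of the check, assuming these internal helpers stay importable (the schemas below are illustrative, not from this commit):

import pyarrow as pa

from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

# Table (requested) schema: a required long and an optional string.
requested = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)

# An int32 "id" column is accepted because int can be promoted to long,
# and the optional "name" column may be missing entirely.
_check_pyarrow_schema_compatible(requested, pa.schema([pa.field("id", pa.int32(), nullable=False)]))

# An extra column the table does not know about still raises ValueError.
try:
    _check_pyarrow_schema_compatible(
        requested,
        pa.schema([pa.field("id", pa.int64(), nullable=False), pa.field("extra", pa.string())]),
    )
except ValueError as e:
    print(e)  # suggests updating the table schema first (union_by_name)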

pyiceberg/schema.py

Lines changed: 100 additions & 0 deletions
@@ -1616,3 +1616,103 @@ def _(file_type: FixedType, read_type: IcebergType) -> IcebergType:
         return read_type
     else:
         raise ResolveError(f"Cannot promote {file_type} to {read_type}")
+
+
+def _check_schema_compatible(requested_schema: Schema, provided_schema: Schema) -> None:
+    """
+    Check if the `provided_schema` is compatible with `requested_schema`.
+
+    Both Schemas must have valid IDs and share the same ID for the same field names.
+
+    Two schemas are considered compatible when:
+    1. All `required` fields in `requested_schema` are present and are also `required` in the `provided_schema`
+    2. Field Types are consistent for fields that are present in both schemas. I.e. the field type
+       in the `provided_schema` can be promoted to the field type of the same field ID in `requested_schema`
+
+    Raises:
+        ValueError: If the schemas are not compatible.
+    """
+    pre_order_visit(requested_schema, _SchemaCompatibilityVisitor(provided_schema))
+
+
+class _SchemaCompatibilityVisitor(PreOrderSchemaVisitor[bool]):
+    provided_schema: Schema
+
+    def __init__(self, provided_schema: Schema):
+        from rich.console import Console
+        from rich.table import Table as RichTable
+
+        self.provided_schema = provided_schema
+        self.rich_table = RichTable(show_header=True, header_style="bold")
+        self.rich_table.add_column("")
+        self.rich_table.add_column("Table field")
+        self.rich_table.add_column("Dataframe field")
+        self.console = Console(record=True)
+
+    def _is_field_compatible(self, lhs: NestedField) -> bool:
+        # Validate nullability first.
+        # An optional field can be missing in the provided schema
+        # But a required field must exist as a required field
+        try:
+            rhs = self.provided_schema.find_field(lhs.field_id)
+        except ValueError:
+            if lhs.required:
+                self.rich_table.add_row("❌", str(lhs), "Missing")
+                return False
+            else:
+                self.rich_table.add_row("✅", str(lhs), "Missing")
+                return True
+
+        if lhs.required and not rhs.required:
+            self.rich_table.add_row("❌", str(lhs), str(rhs))
+            return False
+
+        # Check type compatibility
+        if lhs.field_type == rhs.field_type:
+            self.rich_table.add_row("✅", str(lhs), str(rhs))
+            return True
+        # We only check that the parent node is also of the same type.
+        # We check the type of the child nodes when we traverse them later.
+        elif any(
+            (isinstance(lhs.field_type, container_type) and isinstance(rhs.field_type, container_type))
+            for container_type in {StructType, MapType, ListType}
+        ):
+            self.rich_table.add_row("✅", str(lhs), str(rhs))
+            return True
+        else:
+            try:
+                # If type can be promoted to the requested schema
+                # it is considered compatible
+                promote(rhs.field_type, lhs.field_type)
+                self.rich_table.add_row("✅", str(lhs), str(rhs))
+                return True
+            except ResolveError:
+                self.rich_table.add_row("❌", str(lhs), str(rhs))
+                return False
+
+    def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool:
+        if not (result := struct_result()):
+            self.console.print(self.rich_table)
+            raise ValueError(f"Mismatch in fields:\n{self.console.export_text()}")
+        return result
+
+    def struct(self, struct: StructType, field_results: List[Callable[[], bool]]) -> bool:
+        results = [result() for result in field_results]
+        return all(results)
+
+    def field(self, field: NestedField, field_result: Callable[[], bool]) -> bool:
+        return self._is_field_compatible(field) and field_result()
+
+    def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool:
+        return self._is_field_compatible(list_type.element_field) and element_result()
+
+    def map(self, map_type: MapType, key_result: Callable[[], bool], value_result: Callable[[], bool]) -> bool:
+        return all([
+            self._is_field_compatible(map_type.key_field),
+            self._is_field_compatible(map_type.value_field),
+            key_result(),
+            value_result(),
+        ])
+
+    def primitive(self, primitive: PrimitiveType) -> bool:
+        return True
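The visitor above walks the requested schema in pre-order, records a per-field verdict in a Rich table, and raises a ValueError that embeds the rendered table when any field is incompatible. A minimal sketch of how the two rules in the docstring play out, assuming `_check_schema_compatible` stays importable from pyiceberg.schema (the schemas are illustrative):

from pyiceberg.schema import Schema, _check_schema_compatible
from pyiceberg.types import IntegerType, LongType, NestedField, StringType

requested = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)

# Compatible: field 1 is int in the provided schema, which promotes to long,
# and the optional field 2 is allowed to be missing.
_check_schema_compatible(
    requested,
    Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=True)),
)

# Incompatible: the required field 1 is only optional in the provided schema,
# so a ValueError carrying the rendered field-by-field table is raised.
try:
    _check_schema_compatible(
        requested,
        Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)),
    )
except ValueError as e:
    print(e)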

pyiceberg/table/__init__.py

Lines changed: 10 additions & 5 deletions
@@ -73,7 +73,6 @@
     manifest_evaluator,
 )
 from pyiceberg.io import FileIO, OutputFile, load_file_io
-from pyiceberg.io.pyarrow import _check_schema_compatible, _dataframe_to_data_files, expression_to_pyarrow, project_table
 from pyiceberg.manifest import (
     POSITIONAL_DELETE_SCHEMA,
     DataFile,
@@ -471,6 +470,8 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
         except ModuleNotFoundError as e:
             raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e

+        from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files
+
         if not isinstance(df, pa.Table):
             raise ValueError(f"Expected PyArrow table, got: {df}")

@@ -481,8 +482,8 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
                 f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
             )
         downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
-        _check_schema_compatible(
-            self._table.schema(), other_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
+        _check_pyarrow_schema_compatible(
+            self._table.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
         )

         manifest_merge_enabled = PropertyUtil.property_as_bool(
@@ -528,6 +529,8 @@ def overwrite(
         except ModuleNotFoundError as e:
             raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e

+        from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files
+
         if not isinstance(df, pa.Table):
             raise ValueError(f"Expected PyArrow table, got: {df}")

@@ -538,8 +541,8 @@ def overwrite(
                 f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
            )
         downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
-        _check_schema_compatible(
-            self._table.schema(), other_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
+        _check_pyarrow_schema_compatible(
+            self._table.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
         )

         self.delete(delete_filter=overwrite_filter, snapshot_properties=snapshot_properties)
@@ -566,6 +569,8 @@ def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properti
             delete_filter: A boolean expression to delete rows from a table
             snapshot_properties: Custom properties to be added to the snapshot summary
         """
+        from pyiceberg.io.pyarrow import _dataframe_to_data_files, expression_to_pyarrow, project_table
+
         if (
             self.table_metadata.properties.get(TableProperties.DELETE_MODE, TableProperties.DELETE_MODE_DEFAULT)
             == TableProperties.DELETE_MODE_MERGE_ON_READ
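The user-visible effect of the table/__init__.py changes is that `append` and `overwrite` now validate the incoming PyArrow table with `_check_pyarrow_schema_compatible`, so a dataframe that omits optional columns or uses a promotable type (e.g. int32 for a long column) is accepted, while genuine mismatches fail with the rendered field-by-field table. A hypothetical usage sketch (catalog name, table name, and schema are illustrative):

import pyarrow as pa

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # hypothetical catalog configuration
tbl = catalog.load_table("db.events")  # assumed schema: 1: id (long, optional), 2: name (string, optional)

# "name" is omitted and "id" arrives as int32; the compatibility check accepts it
# and the write path promotes int32 -> int64 to match the table schema.
df = pa.table({"id": pa.array([1, 2, 3], type=pa.int32())})
tbl.append(df)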
