Skip to content

Commit 33438bd

Browse files
authored
Add missing ORC iceberg.required attribute (#2789)
# Rationale for this change

The Iceberg spec expects an `iceberg.required` attribute in addition to `iceberg.id`:

> The column IDs must be stored in ORC type attributes using the key `iceberg.id`, and `iceberg.required` to store "true" if the Iceberg column is required, otherwise it will be optional.

https://iceberg.apache.org/spec/#orc

Fixes #2526

## Are these changes tested?

Yes

## Are there any user-facing changes?

<!-- In the case of user-facing changes, please add the changelog label. -->
1 parent d1826f1 commit 33438bd

File tree

2 files changed

+43
-2
lines changed

2 files changed

+43
-2
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@
196196
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
197197
# ORC type-attribute key under which the Iceberg field ID is stored
198198
ORC_FIELD_ID_KEY = b"iceberg.id"
199+
# ORC type-attribute key that stores "true"/"false" for whether the Iceberg field is required
ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
199200
PYARROW_FIELD_DOC_KEY = b"doc"
200201
LIST_ELEMENT_NAME = "element"
201202
MAP_KEY_NAME = "key"
@@ -717,6 +718,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
717718
else:
718719
# Default to Parquet for backward compatibility
719720
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
721+
if self._file_format == FileFormat.ORC:
722+
metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
720723

721724
return pa.field(
722725
name=field.name,

tests/io/test_pyarrow.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
38403840
id_field_no_ids = arrow_schema_no_ids.field(0)
38413841
name_field_no_ids = arrow_schema_no_ids.field(1)
38423842

3843-
assert not id_field_no_ids.metadata
3844-
assert not name_field_no_ids.metadata
3843+
assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
3844+
assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
3845+
assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
3846+
assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
3847+
3848+
3849+
def test_orc_schema_conversion_with_required_attribute() -> None:
    """
    Verify that schema_to_pyarrow emits the ORC ``iceberg.required`` field attribute.

    The attribute must be absent when converting for Parquet. When converting
    for ORC it must be b"true" on the required field and b"false" on the
    optional one.

    To run just this test:
        pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
    """
    from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
    from pyiceberg.manifest import FileFormat
    from pyiceberg.schema import Schema
    from pyiceberg.types import IntegerType, StringType

    # One required and one optional column, so both attribute values are exercised.
    iceberg_schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(2, "name", StringType(), required=False),
    )

    # Parquet output: the ORC-specific attribute must not appear on either field.
    parquet_schema = schema_to_pyarrow(iceberg_schema, file_format=FileFormat.PARQUET)
    for position in (0, 1):
        assert ORC_FIELD_REQUIRED_KEY not in parquet_schema.field(position).metadata

    # ORC output: each field carries the attribute, mirroring its `required` flag.
    orc_schema = schema_to_pyarrow(iceberg_schema, file_format=FileFormat.ORC)
    expected_flags = {0: b"true", 1: b"false"}
    for position, flag in expected_flags.items():
        assert orc_schema.field(position).metadata[ORC_FIELD_REQUIRED_KEY] == flag
38453883

38463884

38473885
def test_orc_batching_behavior_documentation(tmp_path: Path) -> None:

0 commit comments

Comments
 (0)