Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@
# Arrow field-metadata key under which the Iceberg field ID is stored for Parquet
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
# ORC field ID key for Iceberg field IDs in ORC metadata
ORC_FIELD_ID_KEY = b"iceberg.id"
# ORC field-metadata key holding the field's required flag as a lowercase "true"/"false" string
ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
PYARROW_FIELD_DOC_KEY = b"doc"
LIST_ELEMENT_NAME = "element"
MAP_KEY_NAME = "key"
Expand Down Expand Up @@ -722,6 +723,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
else:
# Default to Parquet for backward compatibility
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
if self._file_format == FileFormat.ORC:
metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()

return pa.field(
name=field.name,
Expand Down
42 changes: 40 additions & 2 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
id_field_no_ids = arrow_schema_no_ids.field(0)
name_field_no_ids = arrow_schema_no_ids.field(1)

assert not id_field_no_ids.metadata
assert not name_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata


def test_orc_schema_conversion_with_required_attribute() -> None:
    """
    Verify that schema_to_pyarrow emits the ORC iceberg.required attribute.

    The attribute must be absent from field metadata when converting for
    Parquet, and must carry the lowercase required flag of each field when
    converting for ORC.

    To run just this test:
        pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
    """
    from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
    from pyiceberg.manifest import FileFormat
    from pyiceberg.schema import Schema
    from pyiceberg.types import IntegerType, StringType

    # One required and one optional column so both flag values are exercised.
    iceberg_schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(2, "name", StringType(), required=False),
    )

    # Parquet conversion: the ORC-only attribute must not leak into metadata.
    parquet_schema = schema_to_pyarrow(iceberg_schema, file_format=FileFormat.PARQUET)
    for position in (0, 1):
        parquet_field = parquet_schema.field(position)
        assert ORC_FIELD_REQUIRED_KEY not in parquet_field.metadata

    # ORC conversion: each field records its required flag as bytes.
    orc_schema = schema_to_pyarrow(iceberg_schema, file_format=FileFormat.ORC)
    expected_flags = ((0, b"true"), (1, b"false"))
    for position, expected in expected_flags:
        orc_field = orc_schema.field(position)
        assert orc_field.metadata[ORC_FIELD_REQUIRED_KEY] == expected


def test_orc_batching_behavior_documentation(tmp_path: Path) -> None:
Expand Down