Add missing ORC iceberg.required attribute (#2789)

ebyhr · web-flow · commit 33438bdb1e78 · 2025-12-01T05:45:16.000+01:00
# Rationale for this change Iceberg spec expects `iceberg.required` attribute in addition to `iceberg.id`: > The column IDs must be stored in ORC type attributes using the key `iceberg.id`, and `iceberg.required` to store "true" if the Iceberg column is required, otherwise it will be optional. https://iceberg.apache.org/spec/#orc Fixes #2526 ## Are these changes tested? Yes ## Are there any user-facing changes?
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -196,6 +196,7 @@
 PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
 # ORC field ID key for Iceberg field IDs in ORC metadata
 ORC_FIELD_ID_KEY = b"iceberg.id"
+ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
 PYARROW_FIELD_DOC_KEY = b"doc"
 LIST_ELEMENT_NAME = "element"
 MAP_KEY_NAME = "key"
@@ -717,6 +718,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
             else:
                 # Default to Parquet for backward compatibility
                 metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+        if self._file_format == FileFormat.ORC:
+            metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
 
         return pa.field(
             name=field.name,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
     id_field_no_ids = arrow_schema_no_ids.field(0)
     name_field_no_ids = arrow_schema_no_ids.field(1)
 
-    assert not id_field_no_ids.metadata
-    assert not name_field_no_ids.metadata
+    assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
+    assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
+    assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
+    assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
+
+
+def test_orc_schema_conversion_with_required_attribute() -> None:
+    """
+    Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.
+    To run just this test:
+        pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
+    """
+    from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
+    from pyiceberg.manifest import FileFormat
+    from pyiceberg.schema import Schema
+    from pyiceberg.types import IntegerType, StringType
+
+    # Define schema
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+    )
+
+    # Test 1: Specify Parquet format
+    arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)
+
+    id_field = arrow_schema_default.field(0)
+    name_field = arrow_schema_default.field(1)
+
+    assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
+    assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata
+
+    # Test 2: Specify ORC format
+    arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
+
+    id_field_orc = arrow_schema_orc.field(0)
+    name_field_orc = arrow_schema_orc.field(1)
+
+    assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true"
+    assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false"
 
 
 def test_orc_batching_behavior_documentation(tmp_path: Path) -> None: