Add as_arrow() to Schema class

apache · Mar 18, 2024 · 0506356 · 0506356
1 parent b447461
commit 0506356
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 0 deletions.
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
@@ -295,6 +295,25 @@ long: [[4.896029,-122.431297,6.0989,2.349014],[6.56667]]
 
 The nested lists indicate the different Arrow buffers, where the first write results into a buffer, and the second append in a separate buffer. This is expected since it will read two parquet files.
 
+
+To avoid type errors during writing, you can enforce the PyArrow table types by converting the Iceberg table schema to an Arrow schema with `as_arrow()`:
+
+```python
+from pyiceberg.catalog import load_catalog
+import pyarrow as pa
+
+catalog = load_catalog("default")
+table = catalog.load_table('default.cities')
+schema = table.schema().as_arrow()
+
+df = pa.Table.from_pylist(
+    [{"city": "Groningen", "lat": 53.21917, "long": 6.56667}],
+    schema=schema
+)
+
+table.append(df)
+```
+
 <!-- prettier-ignore-start -->
 
 !!! example "Under development"

diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py
@@ -64,6 +64,8 @@
 )
 
 if TYPE_CHECKING:
+    import pyarrow as pa
+
     from pyiceberg.table.name_mapping import (
         NameMapping,
     )
@@ -180,6 +182,12 @@ def as_struct(self) -> StructType:
         """Return the schema as a struct."""
         return StructType(*self.fields)
 
+    def as_arrow(self) -> "pa.Schema":
+        """Return the schema as an Arrow schema (converted with ``schema_to_pyarrow``)."""
+        # Imported at call time, not at module level; pyarrow itself is imported under TYPE_CHECKING for the annotation.
+        from pyiceberg.io.pyarrow import schema_to_pyarrow
+        return schema_to_pyarrow(self)
+
     def find_field(self, name_or_id: Union[str, int], case_sensitive: bool = True) -> NestedField:
         """Find a field using a field name or field ID.
 

diff --git a/tests/test_schema.py b/tests/test_schema.py
@@ -1600,3 +1600,19 @@ def test_union_with_pa_schema(primitive_fields: NestedField) -> None:
     )
 
     assert new_schema == expected_schema
+
+
+def test_arrow_schema() -> None:
+    base_schema = Schema(
+        NestedField(field_id=1, name="foo", field_type=StringType(), required=True),
+        NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
+        NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
+    )
+    # Expected result: the required field is non-nullable, the optional fields are nullable.
+    expected_schema = pa.schema([
+        pa.field("foo", pa.string(), nullable=False),
+        pa.field("bar", pa.int32(), nullable=True),
+        pa.field("baz", pa.bool_(), nullable=True),
+    ])
+    # as_arrow() must produce exactly this schema (string / int32 / bool columns).
+    assert base_schema.as_arrow() == expected_schema