def as_arrow(self) -> "pa.Schema":
    """Convert this Iceberg schema into a PyArrow schema.

    Returns:
        The value produced by ``schema_to_pyarrow`` for this schema.
    """
    # Imported at call time: at module level ``pyarrow`` is only imported
    # under ``TYPE_CHECKING``.
    # NOTE(review): presumably this keeps pyarrow optional and/or avoids a
    # circular import with pyiceberg.io.pyarrow -- confirm.
    from pyiceberg.io.pyarrow import schema_to_pyarrow as _to_arrow

    arrow_schema = _to_arrow(self)
    return arrow_schema
def test_arrow_schema() -> None:
    """Verify that ``Schema.as_arrow`` produces the expected PyArrow schema.

    The required field is expected to come back as non-nullable and the
    optional fields as nullable.
    """
    iceberg_fields = (
        NestedField(field_id=1, name="foo", field_type=StringType(), required=True),
        NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
        NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
    )
    arrow_fields = [
        pa.field("foo", pa.string(), nullable=False),
        pa.field("bar", pa.int32(), nullable=True),
        pa.field("baz", pa.bool_(), nullable=True),
    ]

    converted = Schema(*iceberg_fields).as_arrow()

    assert converted == pa.schema(arrow_fields)