diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index bfe1e62fac..ce17931169 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -46,7 +46,20 @@ The environment variable picked up by Iceberg starts with `PYICEBERG_` and then
 
 For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id` on the `default` catalog.
 
-## FileIO
+# Tables
+
+Iceberg tables support table properties to configure table behavior.
+
+## Write options
+
+| Key                               | Options                           | Default | Description                                                                                    |
+| --------------------------------- | --------------------------------- | ------- | ---------------------------------------------------------------------------------------------- |
+| `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd    | Sets the Parquet compression codec.                                                            |
+| `write.parquet.compression-level` | Integer                           | null    | Parquet compression level for the codec. If not set, the codec's default level is used.        |
+| `write.parquet.page-size-bytes`   | Size in bytes                     | 1MB     | Sets a target threshold for the approximate encoded size of data pages within a column chunk.  |
+| `write.parquet.dict-size-bytes`   | Size in bytes                     | 2MB     | Sets the dictionary page size limit per row group.                                             |
+
+# FileIO
 
 Iceberg works with the concept of a FileIO which is a pluggable module for reading, writing, and deleting files. By default, PyIceberg will try to initialize the FileIO that's suitable for the scheme (`s3://`, `gs://`, etc.) and will use the first one that's installed.
 
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index f7c0aef68c..99c1af5ad6 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1757,14 +1757,14 @@ def write_file(table: Table, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
 
 
 def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
-    def _get_int(key: str) -> Optional[int]:
+    def _get_int(key: str, default: Optional[int] = None) -> Optional[int]:
         if value := table_properties.get(key):
             try:
                 return int(value)
             except ValueError as e:
                 raise ValueError(f"Could not parse table property {key} to an integer: {value}") from e
         else:
-            return None
+            return default
 
     for key_pattern in [
         "write.parquet.row-group-size-bytes",
@@ -1784,5 +1784,5 @@ def _get_int(key: str) -> Optional[int]:
         "compression": compression_codec,
         "compression_level": compression_level,
         "data_page_size": _get_int("write.parquet.page-size-bytes"),
-        "dictionary_pagesize_limit": _get_int("write.parquet.dict-size-bytes"),
+        "dictionary_pagesize_limit": _get_int("write.parquet.dict-size-bytes", default=2 * 1024 * 1024),
     }
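
For anyone trying the new write options end to end, here is a minimal sketch of setting them as table properties at creation time. It assumes a configured `default` catalog; the namespace, table name, and schema are illustrative and not part of this change:

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType

catalog = load_catalog("default")  # assumes a catalog named "default" is configured

table = catalog.create_table(
    identifier="docs_example.write_options",  # hypothetical namespace/table
    schema=Schema(NestedField(field_id=1, name="message", field_type=StringType(), required=False)),
    properties={
        # Table properties are stored as strings, hence "3" rather than 3.
        "write.parquet.compression-codec": "zstd",
        "write.parquet.compression-level": "3",
        "write.parquet.page-size-bytes": str(1024 * 1024),      # 1 MB
        "write.parquet.dict-size-bytes": str(2 * 1024 * 1024),  # 2 MB
    },
)
```

These properties are read back by `_get_parquet_writer_kwargs` at write time, so with this diff an unset `write.parquet.dict-size-bytes` resolves to the documented 2 MB default rather than PyArrow's.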
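
And a standalone sketch of the new fallback semantics in `_get_int`, using a throwaway dict in place of the table's real `Properties` mapping:

```python
from typing import Dict, Optional

# Stand-in for the table's properties; values are strings, as stored in metadata.
table_properties: Dict[str, str] = {"write.parquet.page-size-bytes": "1048576"}

def _get_int(key: str, default: Optional[int] = None) -> Optional[int]:
    if value := table_properties.get(key):
        try:
            return int(value)
        except ValueError as e:
            raise ValueError(f"Could not parse table property {key} to an integer: {value}") from e
    else:
        return default

# A set property is parsed exactly as before.
assert _get_int("write.parquet.page-size-bytes") == 1024 * 1024
# An unset property now falls back to the caller-supplied default instead of None.
assert _get_int("write.parquet.dict-size-bytes", default=2 * 1024 * 1024) == 2 * 1024 * 1024
```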