From ef829853957d1644dd42530623423bf3d304e3e8 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sat, 18 Jan 2025 13:57:03 +0000 Subject: [PATCH] Documentation for Location Providers --- mkdocs/docs/configuration.md | 59 ++++++++++++++++++++++++++++++++++++ pyiceberg/table/locations.py | 7 ++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 06eaac1bed..46a27f0177 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -54,6 +54,8 @@ Iceberg tables support table properties to configure table behavior. ### Write options +***TODO:*** Add LocationProvider-related properties here. + | Key | Options | Default | Description | | -------------------------------------- | --------------------------------- | ------- | ------------------------------------------------------------------------------------------- | | `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd | Sets the Parquet compression coddec. | @@ -195,6 +197,63 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya +## Location Providers + +Iceberg works with the concept of a LocationProvider that determines the file paths for a table's data. PyIceberg +introduces a pluggable LocationProvider module; the LocationProvider used may be specified on a per-table basis via +table properties. PyIceberg defaults to the [ObjectStoreLocationProvider](configuration.md#objectstorelocationprovider), +which generates file paths that are optimised for object storage. + +### SimpleLocationProvider + +The SimpleLocationProvider places file names underneath a `data` directory in the table's storage location. 
For example, +a non-partitioned table might have a data file with location: + +```txt +s3://my-bucket/my_table/data/0000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet +``` + +When data is partitioned, the files under a given partition are grouped into a subdirectory, with that partition key +and value as the directory name. For example, a table partitioned over a string column `category` might have a data file +with location: + +```txt +s3://my-bucket/my_table/data/category=orders/0000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet +``` + +The SimpleLocationProvider is enabled for a table by explicitly setting its `write.object-storage.enabled` table property to `false`. + +### ObjectStoreLocationProvider + +When several files are stored under the same prefix, cloud object stores such as S3 often [throttle requests on prefixes](https://repost.aws/knowledge-center/http-5xx-errors-s3), +resulting in slowdowns. + +The ObjectStoreLocationProvider counteracts this by injecting deterministic hashes, in the form of binary directories, +into file paths, to distribute files across a larger number of object store prefixes. + +Partitions are included in file paths just before the file name, in a similar manner to the [SimpleLocationProvider](configuration.md#simplelocationprovider). +A table partitioned over a string column `category` might have a data file with location: (note the additional binary directories) + +```txt +s3://my-bucket/my_table/data/0101/0110/1001/10110010/category=orders/0000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet +``` + +The `write.object-storage.enabled` table property determines whether the ObjectStoreLocationProvider is enabled for a +table. It is used by default. + +When the ObjectStoreLocationProvider is used, the table property `write.object-storage.partitioned-paths`, which +defaults to `true`, can be set to `false` as an additional optimisation. 
This omits partition keys and values from data +file paths *entirely* to further reduce key size. With it disabled, the same data file above would instead be written +to: (note the absence of `category=orders`) + +```txt +s3://my-bucket/my_table/data/1101/0100/1011/00111010-00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet +``` + +### Loading a Custom LocationProvider + +***TODO***. Maybe link to code reference for LocationProvider? + ## Catalogs PyIceberg currently has native catalog type support for REST, SQL, Hive, Glue and DynamoDB. diff --git a/pyiceberg/table/locations.py b/pyiceberg/table/locations.py index 046ee32527..53b41d1e61 100644 --- a/pyiceberg/table/locations.py +++ b/pyiceberg/table/locations.py @@ -30,7 +30,12 @@ class LocationProvider(ABC): - """A base class for location providers, that provide data file locations for write tasks.""" + """A base class for location providers, that provide data file locations for a table's write tasks. + + Args: + table_location (str): The table's base storage location. + table_properties (Properties): The table's properties. + """ table_location: str table_properties: Properties