From 846fc0886d0e5cdfea9448e7b769f6e660a5c786 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 14 Jul 2024 02:08:01 -0700 Subject: [PATCH] add doc --- mkdocs/docs/configuration.md | 145 ++++++++++++++++++++++------------ pyiceberg/catalog/__init__.py | 2 +- pyiceberg/catalog/dynamodb.py | 5 +- pyiceberg/catalog/glue.py | 5 +- 4 files changed, 98 insertions(+), 59 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 76e1816c3a..f2d954cbea 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -22,35 +22,11 @@ hide: - under the License. --> -# Catalogs - -PyIceberg currently has native support for REST, SQL, Hive, Glue and DynamoDB. - -There are three ways to pass in configuration: - -- Using the `~/.pyiceberg.yaml` configuration file -- Through environment variables -- By passing in credentials through the CLI or the Python API - -The configuration file is recommended since that's the easiest way to manage the credentials. - -Another option is through environment variables: - -```sh -export PYICEBERG_CATALOG__DEFAULT__URI=thrift://localhost:9083 -export PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID=username -export PYICEBERG_CATALOG__DEFAULT__S3__SECRET_ACCESS_KEY=password -``` - -The environment variable picked up by Iceberg starts with `PYICEBERG_` and then follows the yaml structure below, where a double underscore `__` represents a nested field, and the underscore `_` is converted into a dash `-`. - -For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id` on the `default` catalog. - -# Tables +## Tables Iceberg tables support table properties to configure table behavior. 
-## Write options +### Write options | Key | Options | Default | Description | | --------------------------------- | --------------------------------- | ------- | ------------------------------------------------------------------------------------------- | @@ -61,7 +37,7 @@ Iceberg tables support table properties to configure table behavior. | `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group | | `write.parquet.row-group-limit` | Number of rows | 122880 | The Parquet row group limit | -## Table behavior options +### Table behavior options | Key | Options | Default | Description | | ------------------------------------ | ------------------- | ------------- | ----------------------------------------------------------- | @@ -76,7 +52,7 @@ Iceberg tables support table properties to configure table behavior. -# FileIO +## FileIO Iceberg works with the concept of a FileIO which is a pluggable module for reading, writing, and deleting files. By default, PyIceberg will try to initialize the FileIO that's suitable for the scheme (`s3://`, `gs://`, etc.) and will use the first one that's installed. @@ -101,8 +77,9 @@ For the FileIO there are several configuration options available: | Key | Example | Description | | -------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | s3.endpoint | https://10.0.19.25/ | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. | -| s3.access-key-id | admin | Configure the static secret access key used to access the FileIO. 
| -| s3.secret-access-key | password | Configure the static session token used to access the FileIO. | +| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | +| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | +| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | | s3.signer | bearer | Configure the signature version of the FileIO. | | s3.signer.uri | http://my.signer:8080/s3 | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/v1/aws/s3/sign`. | | s3.region | us-west-2 | Sets the region of the bucket | @@ -160,7 +137,31 @@ For the FileIO there are several configuration options available: -## REST Catalog +## Catalogs + +PyIceberg currently has native support for REST, SQL, Hive, Glue and DynamoDB. + +There are three ways to pass in configuration: + +- Using the `~/.pyiceberg.yaml` configuration file +- Through environment variables +- By passing in credentials through the CLI or the Python API + +The configuration file is recommended since that's the easiest way to manage the credentials. + +Another option is through environment variables: + +```sh +export PYICEBERG_CATALOG__DEFAULT__URI=thrift://localhost:9083 +export PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID=username +export PYICEBERG_CATALOG__DEFAULT__S3__SECRET_ACCESS_KEY=password +``` + +The environment variable picked up by Iceberg starts with `PYICEBERG_` and then follows the yaml structure below, where a double underscore `__` represents a nested field, and the underscore `_` is converted into a dash `-`. + +For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id` on the `default` catalog. 
+ +### REST Catalog ```yaml catalog: @@ -195,7 +196,7 @@ catalog: -### Headers in RESTCatalog +#### Headers in RESTCatalog To configure custom headers in RESTCatalog, include them in the catalog properties with the prefix `header.`. This ensures that all HTTP requests to the REST service include the specified headers. @@ -208,7 +209,7 @@ catalog: header.content-type: application/vnd.api+json ``` -## SQL Catalog +### SQL Catalog The SQL catalog requires a database for its backend. PyIceberg supports PostgreSQL and SQLite through psycopg2. The database connection has to be configured using the `uri` property. See SQLAlchemy's [documentation for URL format](https://docs.sqlalchemy.org/en/20/core/engines.html#backend-specific-urls): @@ -243,7 +244,7 @@ catalog: | echo | true | false | SQLAlchemy engine [echo param](https://docs.sqlalchemy.org/en/20/core/engines.html#sqlalchemy.create_engine.params.echo) to log all statements to the default log handler | | pool_pre_ping | true | false | SQLAlchemy engine [pool_pre_ping param](https://docs.sqlalchemy.org/en/20/core/engines.html#sqlalchemy.create_engine.params.pool_pre_ping) to test connections for liveness upon each checkout | -## Hive Catalog +### Hive Catalog ```yaml catalog: @@ -263,7 +264,7 @@ catalog: hive.hive2-compatible: true ``` -## Glue Catalog +### Glue Catalog Your AWS credentials can be passed directly through the Python API. 
Otherwise, please refer to @@ -274,30 +275,42 @@ If you did not set up a default AWS profile, you can configure the `profile_name catalog: default: type: glue - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: + glue.access-key-id: + glue.secret-access-key: + glue.session-token: + glue.region: ``` ```yaml catalog: default: type: glue - profile_name: - region_name: + glue.profile-name: + glue.region: ``` -| Key | Example | Description | -| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- | -| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | -| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Default to true | -| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access | +| Key | Example | Description | +| ---------------------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | +| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Default to true | +| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access | +| glue.profile-name | default | Configure the static profile used to access the Glue Catalog | +| glue.region | us-east-1 | Set the region of the Glue Catalog | +| glue.access-key-id | admin | Configure the static access key id used to access the Glue Catalog | +| glue.secret-access-key | password | Configure the static secret access key used to access the Glue Catalog | +| glue.session-token | AQoDYXdzEJr... | Configure the static session token used to access the Glue Catalog | + + +!!! 
warning "Deprecated Properties" + `profile_name`, `region_name`, `botocore_session`, `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` are deprecated and will be removed in 0.8.0. + + + ## DynamoDB Catalog If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer @@ -318,12 +331,44 @@ catalog: default: type: dynamodb table-name: iceberg - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: + dynamodb.access-key-id: + dynamodb.secret-access-key: + dynamodb.session-token: + dynamodb.region: ``` + + +| Key | Example | Description | +| -------------------------- | -------------- | -------------------------------------------------------------------------- | +| dynamodb.profile-name | default | Configure the static profile used to access the DynamoDB Catalog | +| dynamodb.region | us-east-1 | Set the region of the DynamoDB Catalog | +| dynamodb.access-key-id | admin | Configure the static access key id used to access the DynamoDB Catalog | +| dynamodb.secret-access-key | password | Configure the static secret access key used to access the DynamoDB Catalog | +| dynamodb.session-token | AQoDYXdzEJr... | Configure the static session token used to access the DynamoDB Catalog | + + + + + +!!! warning "Deprecated Properties" + `profile_name`, `region_name`, `botocore_session`, `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` are deprecated and will be removed in 0.8.0. + + + +## Unified AWS Credentials + +You can explicitly set the AWS credentials for both Glue/DynamoDB Catalog and S3 FileIO by configuring `client.*` properties. 
+ +| Key | Example | Description | +| ------------------------ | -------------- | ------------------------------------------------------------------------------------------------------ | +| client.region | us-east-1 | Set the region of both the Glue/DynamoDB Catalog and the S3 FileIO | +| client.access-key-id | admin | Configure the static access key id used to access both the Glue/DynamoDB Catalog and the S3 FileIO | +| client.secret-access-key | password | Configure the static secret access key used to access both the Glue/DynamoDB Catalog and the S3 FileIO | +| client.session-token | AQoDYXdzEJr... | Configure the static session token used to access both the Glue/DynamoDB Catalog and the S3 FileIO | + +Note that the `client.*` properties will be overridden by service-specific properties if they are set. For example, if `client.region` is set to `us-west-1` and `s3.region` is set to `us-east-1`, the S3 FileIO will use `us-east-1` as the region. + # Concurrency PyIceberg uses multiple threads to parallelize operations. The number of workers can be configured by supplying a `max-workers` entry in the configuration file, or by setting the `PYICEBERG_MAX_WORKERS` environment variable. The default value depends on the system hardware and Python version. See [the Python documentation](https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor) for more details. diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 829ef62f4a..a84bde0d0c 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -716,7 +716,7 @@ def __init__(self, name: str, **properties: str): deprecated( deprecated_in="0.7.0", removed_in="0.8.0", - help_message=f"The property {property_name} is deprecated. Please use properties that start with aws., glue., and dynamo. instead", + help_message=f"The property {property_name} is deprecated. Please use properties that start with client., glue., and dynamo. 
instead", )(lambda: None)() def create_table_transaction( diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index afdd5a77f3..7cb5d98502 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -86,7 +86,6 @@ DYNAMODB_PROFILE_NAME = "dynamodb.profile-name" DYNAMODB_REGION = "dynamodb.region" -DYNAMODB_BOTOCORE_SESSION = "dynamodb.botocore-session" DYNAMODB_ACCESS_KEY_ID = "dynamodb.access-key-id" DYNAMODB_SECRET_ACCESS_KEY = "dynamodb.secret-access-key" DYNAMODB_SESSION_TOKEN = "dynamodb.session-token" @@ -101,9 +100,7 @@ def __init__(self, name: str, **properties: str): session = boto3.Session( profile_name=PropertyUtil.get_first_property_value(properties, DYNAMODB_PROFILE_NAME, DEPRECATED_PROFILE_NAME), region_name=PropertyUtil.get_first_property_value(properties, DYNAMODB_REGION, AWS_REGION, DEPRECATED_REGION), - botocore_session=PropertyUtil.get_first_property_value( - properties, DYNAMODB_BOTOCORE_SESSION, DEPRECATED_BOTOCORE_SESSION - ), + botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), aws_access_key_id=PropertyUtil.get_first_property_value( properties, DYNAMODB_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID, DEPRECATED_ACCESS_KEY_ID ), diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 26b487f507..fa974a6f5c 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -126,7 +126,6 @@ GLUE_PROFILE_NAME = "glue.profile-name" GLUE_REGION = "glue.region" -GLUE_BOTOCORE_SESSION = "glue.botocore-session" GLUE_ACCESS_KEY_ID = "glue.access-key-id" GLUE_SECRET_ACCESS_KEY = "glue.secret-access-key" GLUE_SESSION_TOKEN = "glue.session-token" @@ -304,9 +303,7 @@ def __init__(self, name: str, **properties: Any): session = boto3.Session( profile_name=PropertyUtil.get_first_property_value(properties, GLUE_PROFILE_NAME, DEPRECATED_PROFILE_NAME), region_name=PropertyUtil.get_first_property_value(properties, GLUE_REGION, AWS_REGION, DEPRECATED_REGION), - 
botocore_session=PropertyUtil.get_first_property_value( - properties, GLUE_BOTOCORE_SESSION, DEPRECATED_BOTOCORE_SESSION - ), + botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), aws_access_key_id=PropertyUtil.get_first_property_value( properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID, DEPRECATED_ACCESS_KEY_ID ),