Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardize AWS credential names #922

Merged
merged 14 commits into from
Jul 19, 2024
29 changes: 29 additions & 0 deletions pyiceberg/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Expand Down Expand Up @@ -66,6 +67,7 @@
RecursiveDict,
)
from pyiceberg.utils.config import Config, merge_config
from pyiceberg.utils.deprecated import deprecated

if TYPE_CHECKING:
import pyarrow as pa
Expand Down Expand Up @@ -100,6 +102,21 @@
re.X,
)

DEPRECATED_PROFILE_NAME = "profile_name"
DEPRECATED_REGION = "region_name"
DEPRECATED_BOTOCORE_SESSION = "botocore_session"
DEPRECATED_ACCESS_KEY_ID = "aws_access_key_id"
DEPRECATED_SECRET_ACCESS_KEY = "aws_secret_access_key"
DEPRECATED_SESSION_TOKEN = "aws_session_token"
DEPRECATED_PROPERTY_NAMES = {
DEPRECATED_PROFILE_NAME,
DEPRECATED_REGION,
DEPRECATED_BOTOCORE_SESSION,
DEPRECATED_ACCESS_KEY_ID,
DEPRECATED_SECRET_ACCESS_KEY,
DEPRECATED_SESSION_TOKEN,
}


class CatalogType(Enum):
REST = "rest"
Expand Down Expand Up @@ -838,6 +855,18 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) -

raise ValueError("No default path is set, please specify a location when creating a table")

def _get_first_property_value(self, property_names: Tuple[str, ...]) -> Optional[Any]:
for property_name in property_names:
if property_value := self.properties.get(property_name):
if property_name in DEPRECATED_PROPERTY_NAMES:
deprecated(
deprecated_in="0.7.0",
removed_in="0.8.0",
help_message=f"The property {property_name} is deprecated. Please use properties start with aws., glue., and dynamo. instead",
HonahX marked this conversation as resolved.
Show resolved Hide resolved
)(lambda: None)()
return property_value
return None

@staticmethod
def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None:
ToOutputFile.table_metadata(metadata, io.new_output(metadata_path))
Expand Down
34 changes: 27 additions & 7 deletions pyiceberg/catalog/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@
import boto3

from pyiceberg.catalog import (
DEPRECATED_ACCESS_KEY_ID,
DEPRECATED_BOTOCORE_SESSION,
DEPRECATED_PROFILE_NAME,
DEPRECATED_REGION,
DEPRECATED_SECRET_ACCESS_KEY,
DEPRECATED_SESSION_TOKEN,
ICEBERG,
METADATA_LOCATION,
PREVIOUS_METADATA_LOCATION,
Expand All @@ -47,7 +53,7 @@
NoSuchTableError,
TableAlreadyExistsError,
)
from pyiceberg.io import load_file_io
from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, load_file_io
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.serializers import FromInputFile
Expand Down Expand Up @@ -78,17 +84,31 @@
ACTIVE = "ACTIVE"
ITEM = "Item"

DYNAMODB_PROFILE_NAME = "dynamodb.profile-name"
DYNAMODB_REGION = "dynamodb.region"
DYNAMODB_BOTOCORE_SESSION = "dynamodb.botocore-session"
DYNAMODB_ACCESS_KEY_ID = "dynamodb.access-key-id"
DYNAMODB_SECRET_ACCESS_KEY = "dynamodb.secret-access-key"
DYNAMODB_SESSION_TOKEN = "dynamodb.session-token"

DYNAMODB_PROFILE_NAME_PROPERTIES = (DYNAMODB_PROFILE_NAME, DEPRECATED_PROFILE_NAME)
DYNAMODB_REGION_PROPERTIES = (DYNAMODB_REGION, AWS_REGION, DEPRECATED_REGION)
DYNAMODB_BOTOCORE_SESSION_PROPERTIES = (DYNAMODB_BOTOCORE_SESSION, DEPRECATED_BOTOCORE_SESSION)
DYNAMODB_ACCESS_KEY_ID_PROPERTIES = (DYNAMODB_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID, DEPRECATED_ACCESS_KEY_ID)
DYNAMODB_SECRET_ACCESS_KEY_PROPERTIES = (DYNAMODB_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, DEPRECATED_SECRET_ACCESS_KEY)
DYNAMODB_SESSION_TOKEN_PROPERTIES = (DYNAMODB_SESSION_TOKEN, AWS_SESSION_TOKEN, DEPRECATED_SESSION_TOKEN)


class DynamoDbCatalog(MetastoreCatalog):
def __init__(self, name: str, **properties: str):
super().__init__(name, **properties)
session = boto3.Session(
profile_name=properties.get("profile_name"),
region_name=properties.get("region_name"),
botocore_session=properties.get("botocore_session"),
aws_access_key_id=properties.get("aws_access_key_id"),
aws_secret_access_key=properties.get("aws_secret_access_key"),
aws_session_token=properties.get("aws_session_token"),
profile_name=self._get_first_property_value(DYNAMODB_PROFILE_NAME_PROPERTIES),
region_name=self._get_first_property_value(DYNAMODB_REGION_PROPERTIES),
botocore_session=self._get_first_property_value(DYNAMODB_BOTOCORE_SESSION_PROPERTIES),
aws_access_key_id=self._get_first_property_value(DYNAMODB_ACCESS_KEY_ID_PROPERTIES),
aws_secret_access_key=self._get_first_property_value(DYNAMODB_SECRET_ACCESS_KEY_PROPERTIES),
aws_session_token=self._get_first_property_value(DYNAMODB_SESSION_TOKEN_PROPERTIES),
)
self.dynamodb = session.client(DYNAMODB_CLIENT)
self.dynamodb_table_name = self.properties.get(DYNAMODB_TABLE_NAME, DYNAMODB_TABLE_NAME_DEFAULT)
Expand Down
33 changes: 27 additions & 6 deletions pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
)

from pyiceberg.catalog import (
DEPRECATED_ACCESS_KEY_ID,
DEPRECATED_BOTOCORE_SESSION,
DEPRECATED_PROFILE_NAME,
DEPRECATED_REGION,
DEPRECATED_SECRET_ACCESS_KEY,
DEPRECATED_SESSION_TOKEN,
EXTERNAL_TABLE,
ICEBERG,
LOCATION,
Expand All @@ -58,6 +64,7 @@
NoSuchTableError,
TableAlreadyExistsError,
)
from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
from pyiceberg.schema import Schema, SchemaVisitor, visit
from pyiceberg.serializers import FromInputFile
Expand Down Expand Up @@ -113,6 +120,20 @@
ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional"
ICEBERG_FIELD_CURRENT = "iceberg.field.current"

GLUE_PROFILE_NAME = "glue.profile-name"
GLUE_REGION = "glue.region"
GLUE_BOTOCORE_SESSION = "glue.botocore-session"
GLUE_ACCESS_KEY_ID = "glue.access-key-id"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How common is it to have a separate access-key-id for glue and the table itself? The S3FileIO would not pick up the glue.access-key-id:

"access_key": self.properties.get(S3_ACCESS_KEY_ID),

This way you would need to set both glue.access-key-id (which is used for authenticating against Glue), and then you need to set s3.access-key-id to ensure that it can access the metadata on the bucket. Maybe good to highlight this on the docs, WDYT?

I'm not an AWS expert, but my gut feeling is that normally people rely on AWS_ACCESS_KEY_ID to be picked up for both Glue and S3, but I prefer to be able to put this in the ~/.pyiceberg.yaml as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestions! I've updated the doc to explicitly indicating that glue.* properties are for Glue Catalog only. I've also added the s3.* properties to the example to make this clear.

but my gut feeling is that normally people rely on AWS_ACCESS_KEY_ID to be picked up for both Glue and S3, but I prefer to be able to put this in the ~/.pyiceberg.yaml as well.

The client.* properties will cover this case. When user set client.* properties, both catalog and FileIO can pick up the credentials. However, due to the limitation of pyarrow's S3FileSystem as noted in #570, we cannot yet support a unified profile-name property.

I added a separate section Unified AWS Credentials for the client.* properties. Please let me know WDYT.

Here are some preview of the updated page:
Screenshot 2024-07-15 at 00 04 10
Screenshot 2024-07-15 at 00 05 18

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, I'm good with that now. At some point we should have a bigger conversation across languages to unify this.

GLUE_SECRET_ACCESS_KEY = "glue.secret-access-key"
GLUE_SESSION_TOKEN = "glue.session-token"

GLUE_PROFILE_NAME_PROPERTIES = (GLUE_PROFILE_NAME, DEPRECATED_PROFILE_NAME)
GLUE_REGION_PROPERTIES = (GLUE_REGION, AWS_REGION, DEPRECATED_REGION)
GLUE_BOTOCORE_SESSION_PROPERTIES = (GLUE_BOTOCORE_SESSION, DEPRECATED_BOTOCORE_SESSION)
GLUE_ACCESS_KEY_ID_PROPERTIES = (GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID, DEPRECATED_ACCESS_KEY_ID)
GLUE_SECRET_ACCESS_KEY_PROPERTIES = (GLUE_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, DEPRECATED_SECRET_ACCESS_KEY)
GLUE_SESSION_TOKEN_PROPERTIES = (GLUE_SESSION_TOKEN, AWS_SESSION_TOKEN, DEPRECATED_SESSION_TOKEN)


def _construct_parameters(
metadata_location: str, glue_table: Optional[TableTypeDef] = None, prev_metadata_location: Optional[str] = None
Expand Down Expand Up @@ -282,12 +303,12 @@ def __init__(self, name: str, **properties: Any):
super().__init__(name, **properties)

session = boto3.Session(
profile_name=properties.get("profile_name"),
region_name=properties.get("region_name"),
botocore_session=properties.get("botocore_session"),
aws_access_key_id=properties.get("aws_access_key_id"),
aws_secret_access_key=properties.get("aws_secret_access_key"),
aws_session_token=properties.get("aws_session_token"),
profile_name=self._get_first_property_value(GLUE_PROFILE_NAME_PROPERTIES),
region_name=self._get_first_property_value(GLUE_REGION_PROPERTIES),
botocore_session=self._get_first_property_value(GLUE_BOTOCORE_SESSION_PROPERTIES),
aws_access_key_id=self._get_first_property_value(GLUE_ACCESS_KEY_ID_PROPERTIES),
aws_secret_access_key=self._get_first_property_value(GLUE_SECRET_ACCESS_KEY_PROPERTIES),
aws_session_token=self._get_first_property_value(GLUE_SESSION_TOKEN_PROPERTIES),
)
self.glue: GlueClient = session.client("glue")

Expand Down
18 changes: 18 additions & 0 deletions pyiceberg/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
from io import SEEK_SET
from types import TracebackType
from typing import (
Any,
Dict,
List,
Optional,
Protocol,
Tuple,
Type,
Union,
runtime_checkable,
Expand All @@ -46,6 +48,10 @@

logger = logging.getLogger(__name__)

AWS_REGION = "client.region"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose client. for unified configurations because in java:
https://github.com/apache/iceberg/blob/c68abfc9fd3956077b43aba20441f089bb8b93d6/aws/src/main/java/org/apache/iceberg/aws/AwsClientProperties.java#L67
client.region is used for configurations that affect all the clients.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, that's a good find 👍

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good to highlight this in the PR description!

AWS_ACCESS_KEY_ID = "client.access-key-id"
AWS_SECRET_ACCESS_KEY = "client.secret-access-key"
AWS_SESSION_TOKEN = "client.session-token"
S3_ENDPOINT = "s3.endpoint"
S3_ACCESS_KEY_ID = "s3.access-key-id"
S3_SECRET_ACCESS_KEY = "s3.secret-access-key"
Expand Down Expand Up @@ -77,6 +83,11 @@
GCS_DEFAULT_LOCATION = "gcs.default-bucket-location"
GCS_VERSION_AWARE = "gcs.version-aware"

S3_REGION_PROPERTIES = (S3_REGION, AWS_REGION)
S3_ACCESS_KEY_ID_PROPERTIES = (S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID)
S3_SECRET_ACCESS_KEY_PROPERTIES = (S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
S3_SESSION_TOKEN_PROPERTIES = (S3_SESSION_TOKEN, AWS_SESSION_TOKEN)


@runtime_checkable
class InputStream(Protocol):
Expand Down Expand Up @@ -320,6 +331,13 @@ def _infer_file_io_from_scheme(path: str, properties: Properties) -> Optional[Fi
return None


def _get_first_property_value(properties: Properties, property_names: Tuple[str, ...]) -> Optional[Any]:
HonahX marked this conversation as resolved.
Show resolved Hide resolved
for property_name in property_names:
if property_value := properties.get(property_name):
return property_value
return None


def load_file_io(properties: Properties = EMPTY_DICT, location: Optional[str] = None) -> FileIO:
# First look for the py-io-impl property to directly load the class
if io_impl := properties.get(PY_IO_IMPL):
Expand Down
17 changes: 9 additions & 8 deletions pyiceberg/io/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,21 @@
GCS_SESSION_KWARGS,
GCS_TOKEN,
GCS_VERSION_AWARE,
S3_ACCESS_KEY_ID,
S3_ACCESS_KEY_ID_PROPERTIES,
S3_CONNECT_TIMEOUT,
S3_ENDPOINT,
S3_PROXY_URI,
S3_REGION,
S3_SECRET_ACCESS_KEY,
S3_SESSION_TOKEN,
S3_REGION_PROPERTIES,
S3_SECRET_ACCESS_KEY_PROPERTIES,
S3_SESSION_TOKEN_PROPERTIES,
S3_SIGNER_URI,
ADLFS_ClIENT_SECRET,
FileIO,
InputFile,
InputStream,
OutputFile,
OutputStream,
_get_first_property_value,
)
from pyiceberg.typedef import Properties

Expand Down Expand Up @@ -116,10 +117,10 @@ def _s3(properties: Properties) -> AbstractFileSystem:

client_kwargs = {
"endpoint_url": properties.get(S3_ENDPOINT),
"aws_access_key_id": properties.get(S3_ACCESS_KEY_ID),
"aws_secret_access_key": properties.get(S3_SECRET_ACCESS_KEY),
"aws_session_token": properties.get(S3_SESSION_TOKEN),
"region_name": properties.get(S3_REGION),
"aws_access_key_id": _get_first_property_value(properties, S3_ACCESS_KEY_ID_PROPERTIES),
"aws_secret_access_key": _get_first_property_value(properties, S3_SECRET_ACCESS_KEY_PROPERTIES),
"aws_session_token": _get_first_property_value(properties, S3_SESSION_TOKEN_PROPERTIES),
"region_name": _get_first_property_value(properties, S3_REGION_PROPERTIES),
}
config_kwargs = {}
register_events: Dict[str, Callable[[Properties], None]] = {}
Expand Down
17 changes: 9 additions & 8 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,19 @@
HDFS_KERB_TICKET,
HDFS_PORT,
HDFS_USER,
S3_ACCESS_KEY_ID,
S3_ACCESS_KEY_ID_PROPERTIES,
S3_CONNECT_TIMEOUT,
S3_ENDPOINT,
S3_PROXY_URI,
S3_REGION,
S3_SECRET_ACCESS_KEY,
S3_SESSION_TOKEN,
S3_REGION_PROPERTIES,
S3_SECRET_ACCESS_KEY_PROPERTIES,
S3_SESSION_TOKEN_PROPERTIES,
FileIO,
InputFile,
InputStream,
OutputFile,
OutputStream,
_get_first_property_value,
)
from pyiceberg.manifest import (
DataFile,
Expand Down Expand Up @@ -344,10 +345,10 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste

client_kwargs: Dict[str, Any] = {
"endpoint_override": self.properties.get(S3_ENDPOINT),
"access_key": self.properties.get(S3_ACCESS_KEY_ID),
"secret_key": self.properties.get(S3_SECRET_ACCESS_KEY),
"session_token": self.properties.get(S3_SESSION_TOKEN),
"region": self.properties.get(S3_REGION),
"access_key": _get_first_property_value(self.properties, S3_ACCESS_KEY_ID_PROPERTIES),
"secret_key": _get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY_PROPERTIES),
"session_token": _get_first_property_value(self.properties, S3_SESSION_TOKEN_PROPERTIES),
"region": _get_first_property_value(self.properties, S3_REGION_PROPERTIES),
}

if proxy_uri := self.properties.get(S3_PROXY_URI):
Expand Down