diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index ab85893ab4..c756487c32 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -87,9 +87,12 @@
     ADLS_ACCOUNT_NAME,
     ADLS_BLOB_STORAGE_AUTHORITY,
     ADLS_BLOB_STORAGE_SCHEME,
+    ADLS_CLIENT_ID,
+    ADLS_CLIENT_SECRET,
     ADLS_DFS_STORAGE_AUTHORITY,
     ADLS_DFS_STORAGE_SCHEME,
     ADLS_SAS_TOKEN,
+    ADLS_TENANT_ID,
     AWS_ACCESS_KEY_ID,
     AWS_REGION,
     AWS_ROLE_ARN,
@@ -501,6 +504,7 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
         return S3FileSystem(**client_kwargs)
 
     def _initialize_azure_fs(self) -> FileSystem:
+        # https://arrow.apache.org/docs/python/generated/pyarrow.fs.AzureFileSystem.html
         from packaging import version
 
         MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS = "20.0.0"
@@ -535,6 +539,24 @@ def _initialize_azure_fs(self) -> FileSystem:
         if sas_token := self.properties.get(ADLS_SAS_TOKEN):
             client_kwargs["sas_token"] = sas_token
 
+        if client_id := self.properties.get(ADLS_CLIENT_ID):
+            client_kwargs["client_id"] = client_id
+        if client_secret := self.properties.get(ADLS_CLIENT_SECRET):
+            client_kwargs["client_secret"] = client_secret
+        if tenant_id := self.properties.get(ADLS_TENANT_ID):
+            client_kwargs["tenant_id"] = tenant_id
+
+        # pyarrow accepts all three together (ClientSecretCredential) or client_id alone (ManagedIdentityCredential).
+        credential_keys = ["client_id", "client_secret", "tenant_id"]
+        provided_keys = [key for key in credential_keys if key in client_kwargs]
+        if provided_keys not in ([], ["client_id"], credential_keys):
+            missing_keys = [key for key in credential_keys if key not in client_kwargs]
+            raise ValueError(
+                "client_id, client_secret, and tenant_id must be provided together to use "
+                "ClientSecretCredential (client_id alone selects ManagedIdentityCredential). "
+                f"Provided: {provided_keys}, Missing: {missing_keys}"
+            )
+
         return AzureFileSystem(**client_kwargs)
 
     def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem: