From 1675f74065e7a575fbbb40ee241208168bace063 Mon Sep 17 00:00:00 2001 From: Pat Patterson Date: Thu, 23 Jan 2025 12:16:16 -0800 Subject: [PATCH] add request timeout config --- mkdocs/docs/configuration.md | 1 + pyiceberg/io/__init__.py | 1 + pyiceberg/io/fsspec.py | 4 ++++ pyiceberg/io/pyarrow.py | 7 +++++++ 4 files changed, 13 insertions(+) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 06eaac1bed..a0b8dce936 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -116,6 +116,7 @@ For the FileIO there are several configuration options available: | s3.region | us-west-2 | Configure the default region used to initialize an `S3FileSystem`. `PyArrowFileIO` attempts to automatically resolve the region for each S3 bucket, falling back to this value if resolution fails. | | s3.proxy-uri | | Configure the proxy server to be used by the FileIO. | | s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | +| s3.request-timeout | 60.0 | Configure socket read timeouts on Windows and macOS, in seconds. | | s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. 
| diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index f322221e4b..cee40f83a9 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -61,6 +61,7 @@ S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" S3_CONNECT_TIMEOUT = "s3.connect-timeout" +S3_REQUEST_TIMEOUT = "s3.request-timeout" S3_SIGNER_URI = "s3.signer.uri" S3_SIGNER_ENDPOINT = "s3.signer.endpoint" S3_SIGNER_ENDPOINT_DEFAULT = "v1/aws/s3/sign" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 5ac5ce7d4c..962459ed4f 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -62,6 +62,7 @@ GCS_VERSION_AWARE, S3_ACCESS_KEY_ID, S3_CONNECT_TIMEOUT, + S3_REQUEST_TIMEOUT, S3_ENDPOINT, S3_PROXY_URI, S3_REGION, @@ -150,6 +151,9 @@ def _s3(properties: Properties) -> AbstractFileSystem: if connect_timeout := properties.get(S3_CONNECT_TIMEOUT): config_kwargs["connect_timeout"] = float(connect_timeout) + if request_timeout := properties.get(S3_REQUEST_TIMEOUT): + config_kwargs["read_timeout"] = float(request_timeout) + fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs) for event_name, event_function in register_events.items(): diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index e367aa586c..254cd6182c 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -100,6 +100,7 @@ PYARROW_USE_LARGE_TYPES_ON_READ, S3_ACCESS_KEY_ID, S3_CONNECT_TIMEOUT, + S3_REQUEST_TIMEOUT, S3_ENDPOINT, S3_FORCE_VIRTUAL_ADDRESSING, S3_PROXY_URI, @@ -394,6 +395,9 @@ def _initialize_oss_fs(self) -> FileSystem: if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): client_kwargs["connect_timeout"] = float(connect_timeout) + if request_timeout := self.properties.get(S3_REQUEST_TIMEOUT): + client_kwargs["request_timeout"] = float(request_timeout) + if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): client_kwargs["role_arn"] = role_arn @@ -438,6 +442,9 @@ def _initialize_s3_fs(self, netloc: 
Optional[str]) -> FileSystem: if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): client_kwargs["connect_timeout"] = float(connect_timeout) + if request_timeout := self.properties.get(S3_REQUEST_TIMEOUT): + client_kwargs["request_timeout"] = float(request_timeout) + if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): client_kwargs["role_arn"] = role_arn