From f76192a034b036e6582cfad92af6b7a3c1d737f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stef=C3=A1n=20P=C3=A1ll=20Sturluson?= Date: Mon, 14 Mar 2022 11:48:34 +0000 Subject: [PATCH] Add custom_encoding input to table_import_from_s3 Some tools do not properly set the ContentEncoding metadata on write in S3. This update adds functionality to specify a custom encoding (e.g. "gzip") as input to the `table_import_from_s3` function. --- aws_s3--0.0.1.sql | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/aws_s3--0.0.1.sql b/aws_s3--0.0.1.sql index 93ee6d0..0f153c4 100644 --- a/aws_s3--0.0.1.sql +++ b/aws_s3--0.0.1.sql @@ -48,7 +48,8 @@ CREATE OR REPLACE FUNCTION aws_s3.table_import_from_s3 ( access_key text default null, secret_key text default null, session_token text default null, - endpoint_url text default null + endpoint_url text default null, + content_encoding text default null ) RETURNS int LANGUAGE plpython3u AS $$ @@ -90,10 +91,10 @@ AS $$ obj = s3.Object(bucket, file_path) response = obj.get() - content_encoding = response.get('ContentEncoding') - body = response['Body'] + content_encoding = content_encoding or response.get('ContentEncoding') user_content_encoding = response.get('x-amz-meta-content-encoding') - + body = response['Body'] + with tempfile.NamedTemporaryFile() as fd: if (content_encoding and content_encoding.lower() == 'gzip') or (user_content_encoding and user_content_encoding.lower() == 'gzip'): with gzip.GzipFile(fileobj=body) as gzipfile: @@ -124,14 +125,15 @@ CREATE OR REPLACE FUNCTION aws_s3.table_import_from_s3( options text, s3_info aws_commons._s3_uri_1, credentials aws_commons._aws_credentials_1, - endpoint_url text default null + endpoint_url text default null, + content_encoding text default null ) RETURNS INT LANGUAGE plpython3u AS $$ plan = plpy.prepare( - 'SELECT aws_s3.table_import_from_s3($1, $2, $3, $4, $5, $6, $7, $8, $9) AS num_rows', - ['TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT'] + 'SELECT aws_s3.table_import_from_s3($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) AS num_rows', + ['TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT'] ) return plan.execute( [ @@ -144,7 +146,8 @@ AS $$ credentials['access_key'], credentials['secret_key'], credentials['session_token'], - endpoint_url + endpoint_url, + content_encoding ] )[0]['num_rows'] $$;