# explorer/ee/db_connections/create_sqlite.py
import logging
import os

from .mime import is_csv, is_json, is_json_list, is_sqlite
from explorer.ee.db_connections.type_infer import json_to_typed_df, json_list_to_typed_df, csv_to_typed_df
from explorer.ee.db_connections.utils import pandas_to_sqlite


logger = logging.getLogger(__name__)

def get_bytes_and_name_for_upload(file):
    if is_csv(file):
        df_fun = csv_to_typed_df
    elif is_json_list(file):  # must go before is_json, as it is a subset
        df_fun = json_list_to_typed_df
    elif is_json(file):
        df_fun = json_to_typed_df
    elif is_sqlite(file):
        df_fun = None
    else:
        logger.error(f"File {file.name} is not a csv, json, or sqlite file.")
        raise TypeError(f"File {file.name} is not a csv, json, or sqlite file.")

    try:
        return parse_to_sqlite(file, df_fun)
    except ValueError as e:
        logger.error(f"Error parsing {file.name}: {e}")
        raise e

def parse_to_sqlite(file, df_parser):
    f_name = file.name
    f_bytes = file.read()
    if df_parser:
        df = df_parser(f_bytes)
        try:
            f_bytes = pandas_to_sqlite(df)
        except Exception as e:  # noqa
            logger.exception(f"Exception while parsing file {f_name}: {e}")
            # Chain from the original exception so its traceback is preserved
            raise ValueError("Error while parsing the file.") from e
    # replace the previous extension with .db, as it is now a sqlite file
    name, _ = os.path.splitext(f_name)
    f_name = f"{name}.db"
    return f_bytes, f_name
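
if __name__ == "__main__":
    # Usage sketch, not part of the original commit. FakeUpload is a
    # hypothetical stand-in for Django's UploadedFile: the code above only
    # needs name, content_type, read() and seek(). Assumes the explorer EE
    # package is importable (run via `python -m`) and pandas is installed.
    import io

    class FakeUpload(io.BytesIO):
        def __init__(self, name, content_type, payload):
            super().__init__(payload)
            self.name = name
            self.content_type = content_type

    upload = FakeUpload("orders.csv", "text/csv", b"id,total\n1,9.99\n2,12.50\n")
    f_bytes, f_name = get_bytes_and_name_for_upload(upload)
    # f_name is now "orders.db"; f_bytes holds the bytes of a SQLite database.
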

# explorer/ee/db_connections/mime.py
import csv
import json

# These are "shallow" checks. They just establish whether the upload appears valid at a surface level.
# A deeper check will happen when pandas tries to parse the file.
# This is designed to be quick, and simply assigns the right (full) parsing function to the uploaded file.

def is_csv(file):
    if file.content_type != "text/csv":
        return False
    try:
        # Check if the file content can be read as a CSV
        file.seek(0)
        sample = file.read(1024).decode("utf-8")
        csv.Sniffer().sniff(sample)
        file.seek(0)
        return True
    except csv.Error:
        return False

def is_json(file):
    if file.content_type != "application/json":
        return False
    if not file.name.lower().endswith(".json"):
        return False
    return True

def is_json_list(file):
    if not file.name.lower().endswith(".json"):
        return False
    file.seek(0)
    first_line = file.readline()
    file.seek(0)
    try:
        json.loads(first_line.decode("utf-8"))
        return True
    except ValueError:
        return False

def is_sqlite(file):
    if file.content_type != "application/x-sqlite3":
        return False
    try:
        # Check if the file starts with the SQLite file header
        file.seek(0)
        header = file.read(16)
        file.seek(0)
        return header == b"SQLite format 3\x00"
    except Exception as e:  # noqa
        return False
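
if __name__ == "__main__":
    # Usage sketch, hypothetical: exercise the shallow checks with an
    # in-memory stand-in for Django's UploadedFile. The 16-byte header
    # b"SQLite format 3\x00" is the documented SQLite magic string; the
    # other payloads are made-up sample data.
    import io

    class FakeUpload(io.BytesIO):
        def __init__(self, name, content_type, payload):
            super().__init__(payload)
            self.name = name
            self.content_type = content_type

    assert is_csv(FakeUpload("data.csv", "text/csv", b"a,b,c\n1,2,3\n4,5,6\n"))
    assert is_json_list(FakeUpload("rows.json", "application/json", b'{"a": 1}\n{"a": 2}\n'))
    assert is_sqlite(FakeUpload("test.db", "application/x-sqlite3", b"SQLite format 3\x00" + b"\x00" * 84))
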

# explorer/ee/db_connections/type_infer.py
import io
import json


MAX_TYPING_SAMPLE_SIZE = 10000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
    import pandas as pd
    csv_file = io.BytesIO(csv_bytes)
    df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
    return df_to_typed_df(df)

def json_list_to_typed_df(json_bytes):
    import pandas as pd
    data = []
    for line in io.BytesIO(json_bytes).readlines():
        data.append(json.loads(line.decode("utf-8")))

    df = pd.json_normalize(data)
    return df_to_typed_df(df)

def json_to_typed_df(json_bytes):
    import pandas as pd
    json_file = io.BytesIO(json_bytes)
    json_content = json.load(json_file)
    df = pd.json_normalize(json_content)
    return df_to_typed_df(df)
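
# For context: pd.json_normalize flattens nested objects into dotted column
# names, which is what turns arbitrary JSON uploads into something tabular.
# A small illustration with a made-up record:
#
#   pd.json_normalize([{"id": 1, "user": {"name": "ada", "age": 36}}])
#   # produces the columns: id, user.name, user.age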


def atof_custom(value):
    # Remove any thousands separators and convert the decimal point
    if "," in value and "." in value:
        if value.index(",") < value.index("."):
            # 0,000.00 format
            value = value.replace(",", "")
        else:
            # 0.000,00 format
            value = value.replace(".", "").replace(",", ".")
    elif "," in value:
        # No decimal point, only thousands separator
        value = value.replace(",", "")
    return float(value)
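
# Worked examples of the separator handling above (hypothetical inputs):
#
#   atof_custom("1,234.56")  ->  1234.56   # US style: comma thousands, dot decimal
#   atof_custom("1.234,56")  ->  1234.56   # European style: dot thousands, comma decimal
#   atof_custom("12,345")    ->  12345.0   # a lone comma is read as a thousands separator
#
# Note the last case is ambiguous ("12,345" could be a European decimal), but
# with no dot present the code assumes the comma separates thousands.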


def df_to_typed_df(df):  # noqa
    import pandas as pd
    from dateutil import parser
    try:
        for column in df.columns:
            # Sample distinct values so typing stays fast on large uploads
            values = df[column].dropna().unique()
            if len(values) > MAX_TYPING_SAMPLE_SIZE:
                values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

            is_date = False
            is_integer = True
            is_float = True

            for value in values:
                try:
                    float_val = atof_custom(str(value))
                    if float_val == int(float_val):
                        continue  # This is effectively an integer
                    else:
                        is_integer = False
                except ValueError:
                    is_integer = False
                    is_float = False
                    break

            if is_integer:
                is_float = False

            if not is_integer and not is_float:
                is_date = True

                # The dateutil parser is very aggressive and will interpret many short strings as dates.
                # For example "12a" will be interpreted as 12:00 AM on the current date.
                # That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23.
                try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
                if len(try_parse) > 0:
                    for value in try_parse:
                        try:
                            parser.parse(str(value))
                        except (ValueError, TypeError, OverflowError):
                            is_date = False
                            break
                else:
                    is_date = False

            if is_date:
                df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
            elif is_integer:
                df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
                # If there are NaN / blank values, the column will be converted to float
                # Convert it back to integer
                df[column] = df[column].astype("Int64")
            elif is_float:
                df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
            else:
                inferred_type = pd.api.types.infer_dtype(values)
                if inferred_type == "integer":
                    df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
                elif inferred_type == "floating":
                    df[column] = pd.to_numeric(df[column], errors="coerce")

        return df

    except pd.errors.ParserError as e:
        return str(e)
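
if __name__ == "__main__":
    # End-to-end sketch with made-up data: by the rules above, "id" should
    # come out as Int64, "amount" as float (via atof_custom), and "signup"
    # as a UTC datetime. Assumes pandas and python-dateutil are installed.
    raw = b'id,amount,signup\n1,"1,234.56",2023-01-15\n2,"2,000.00",2023-02-01\n'
    typed = csv_to_typed_df(raw)
    print(typed.dtypes)
    # Expected, roughly:
    #   id                      Int64
    #   amount                float64
    #   signup    datetime64[ns, UTC]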