refactor and handling different json docs
chrisclark committed Jul 15, 2024
1 parent 0306d64 commit 125fd42
Showing 8 changed files with 354 additions and 229 deletions.
46 changes: 46 additions & 0 deletions explorer/ee/db_connections/create_sqlite.py
@@ -0,0 +1,46 @@
import logging
import os

from .mime import is_csv, is_json, is_json_list, is_sqlite
from explorer.ee.db_connections.type_infer import json_to_typed_df, json_list_to_typed_df, csv_to_typed_df
from explorer.ee.db_connections.utils import pandas_to_sqlite


logger = logging.getLogger(__name__)


def get_bytes_and_name_for_upload(file):
if is_csv(file):
df_fun = csv_to_typed_df
elif is_json_list(file): # must go before is_json, as it is a subset
df_fun = json_list_to_typed_df
elif is_json(file):
df_fun = json_to_typed_df
elif is_sqlite(file):
df_fun = None
else:
        logger.error(f"File {file.name} is not a csv, json, or sqlite file.")
        raise TypeError(f"File {file.name} is not a csv, json, or sqlite file.")

try:
return parse_to_sqlite(file, df_fun)
except ValueError as e:
        logger.error(f"Error parsing {file.name}: {e}")
raise e


def parse_to_sqlite(file, df_parser):
f_name = file.name
f_bytes = file.read()
if df_parser:
df = df_parser(f_bytes)
try:
f_bytes = pandas_to_sqlite(df)
except Exception as e: # noqa
logger.exception(f"Exception while parsing file {f_name}: {e}")
raise ValueError("Error while parsing the file.")

Check failure on line 41 in explorer/ee/db_connections/create_sqlite.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (B904)

explorer/ee/db_connections/create_sqlite.py:41:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling

Check failure on line 41 in explorer/ee/db_connections/create_sqlite.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (B904)

explorer/ee/db_connections/create_sqlite.py:41:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
# replace the previous extension with .db, as it is now a sqlite file
name, _ = os.path.splitext(f_name)
f_name = f"{name}.db"
return f_bytes, f_name
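
For orientation, a minimal sketch of how an upload flows through this helper. It uses Django's SimpleUploadedFile as a stand-in for a real upload; the file name and CSV payload are invented for the example:

# Illustrative usage only -- not part of this commit.
from django.core.files.uploadedfile import SimpleUploadedFile

from explorer.ee.db_connections.create_sqlite import get_bytes_and_name_for_upload

upload = SimpleUploadedFile(
    "sales.csv",                               # invented file name
    b"region,amount\nwest,1200\neast,3400\n",  # invented payload
    content_type="text/csv",
)

# is_csv() matches, so the CSV is parsed into a typed DataFrame and
# re-serialized as SQLite bytes; the extension is swapped to .db.
f_bytes, f_name = get_bytes_and_name_for_upload(upload)
assert f_name == "sales.db"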

54 changes: 54 additions & 0 deletions explorer/ee/db_connections/mime.py
@@ -0,0 +1,54 @@
import csv
import json

# These are 'shallow' checks. They are just to understand if the upload appears valid at surface-level.
# A deeper check will happen when pandas tries to parse the file.
# This is designed to be quick, and simply assigns the right (full) parsing function to the uploaded file.


def is_csv(file):
if file.content_type != "text/csv":
return False
try:
# Check if the file content can be read as a CSV
file.seek(0)
        sample = file.read(1024).decode("utf-8")
csv.Sniffer().sniff(sample)
file.seek(0)
return True
except csv.Error:
return False


def is_json(file):
if file.content_type != "application/json":
return False
    if not file.name.lower().endswith(".json"):
return False
return True


def is_json_list(file):
    if not file.name.lower().endswith(".json"):
return False
file.seek(0)
first_line = file.readline()
file.seek(0)
try:
        json.loads(first_line.decode("utf-8"))
return True
except ValueError:
return False


def is_sqlite(file):
if file.content_type != "application/x-sqlite3":
return False
try:
# Check if the file starts with the SQLite file header
file.seek(0)
header = file.read(16)
file.seek(0)
        return header == b"SQLite format 3\x00"
except Exception as e: # noqa
return False
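
The ordering constraint noted in create_sqlite.py ("must go before is_json, as it is a subset") is easiest to see with concrete uploads. A hedged sketch, using an invented io.BytesIO stand-in that carries the two attributes these checks read:

# Illustrative only -- FakeUpload and the payloads are invented for the example.
import io

from explorer.ee.db_connections.mime import is_json, is_json_list


class FakeUpload(io.BytesIO):
    """Minimal stand-in for an uploaded file: just name + content_type."""
    def __init__(self, name, content_type, payload):
        super().__init__(payload)
        self.name = name
        self.content_type = content_type


# One JSON document spread over several lines: the first line ("{")
# is not valid JSON on its own, so is_json_list rejects it.
doc = FakeUpload("single.json", "application/json", b'{\n  "a": 1\n}\n')
assert is_json(doc) and not is_json_list(doc)

# Newline-delimited JSON: the first line is a complete document, so
# is_json_list claims the file before is_json is ever consulted.
ndjson = FakeUpload("rows.json", "application/json", b'{"a": 1}\n{"a": 2}\n')
assert is_json_list(ndjson)
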
115 changes: 115 additions & 0 deletions explorer/ee/db_connections/type_infer.py
@@ -0,0 +1,115 @@
import io
import json


MAX_TYPING_SAMPLE_SIZE = 10000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
import pandas as pd
csv_file = io.BytesIO(csv_bytes)
df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
return df_to_typed_df(df)


def json_list_to_typed_df(json_bytes):
import pandas as pd
data = []
for line in io.BytesIO(json_bytes).readlines():
        data.append(json.loads(line.decode("utf-8")))

df = pd.json_normalize(data)
return df_to_typed_df(df)


def json_to_typed_df(json_bytes):
import pandas as pd
json_file = io.BytesIO(json_bytes)
json_content = json.load(json_file)
df = pd.json_normalize(json_content)
return df_to_typed_df(df)


def atof_custom(value):
# Remove any thousands separators and convert the decimal point
if "," in value and "." in value:
if value.index(",") < value.index("."):
# 0,000.00 format
value = value.replace(",", "")
else:
# 0.000,00 format
value = value.replace(".", "").replace(",", ".")
elif "," in value:
# No decimal point, only thousands separator
value = value.replace(",", "")
    return float(value)


def df_to_typed_df(df): # noqa
import pandas as pd
from dateutil import parser
try:

for column in df.columns:
values = df[column].dropna().unique()
if len(values) > MAX_TYPING_SAMPLE_SIZE:
values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

is_date = False
is_integer = True
is_float = True

for value in values:
try:
float_val = atof_custom(str(value))
if float_val == int(float_val):
continue # This is effectively an integer
else:
is_integer = False
except ValueError:
is_integer = False
is_float = False
break

if is_integer:
is_float = False

if not is_integer and not is_float:
is_date = True

# The dateutil parser is very aggressive and will interpret many short strings as dates.
# For example "12a" will be interpreted as 12:00 AM on the current date.
# That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
if len(try_parse) > 0:
for value in try_parse:
try:
parser.parse(str(value))
except (ValueError, TypeError, OverflowError):
is_date = False
break
else:
is_date = False

if is_date:
df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
elif is_integer:
df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
# If there are NaN / blank values, the column will be converted to float
# Convert it back to integer
df[column] = df[column].astype("Int64")
elif is_float:
df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
else:
inferred_type = pd.api.types.infer_dtype(values)
if inferred_type == "integer":
df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
elif inferred_type == "floating":
df[column] = pd.to_numeric(df[column], errors="coerce")

return df

except pd.errors.ParserError as e:
return str(e)
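
To make the inference concrete, a small worked example of csv_to_typed_df on invented data (exact dtypes can vary with the pandas version):

# Illustrative only -- the sample rows are invented for the example.
from explorer.ee.db_connections.type_infer import csv_to_typed_df

csv_bytes = (
    b"id,price,signup\n"
    b'1,"1,200.50",2023-01-15\n'
    b'2,"3,400.00",2023-02-20\n'
)

df = csv_to_typed_df(csv_bytes)
# id:     whole numbers only           -> nullable Int64
# price:  atof_custom strips the comma -> float ("1,200.50" becomes 1200.5)
# signup: parses with dateutil         -> timezone-aware (UTC) datetime
print(df.dtypes)
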
118 changes: 1 addition & 117 deletions explorer/ee/db_connections/utils.py
@@ -1,6 +1,6 @@
from django.db import DatabaseError
from django.db.utils import load_backend
-import os, json
+import os

import sqlite3
import io
@@ -102,119 +102,3 @@ def pandas_to_sqlite(df, local_path="local_database.db"):
# Delete the local SQLite database file
# Finally block to ensure we don't litter files around
os.remove(local_path)


def json_list_to_typed_df(json_bytes):
import pandas as pd
data = []
for line in io.BytesIO(json_bytes).readlines():
data.append(json.loads(line.decode('utf-8')))

df = pd.json_normalize(data)
return df_to_typed_df(df)


MAX_TYPING_SAMPLE_SIZE = 10000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def atof_custom(value):
# Remove any thousands separators and convert the decimal point
if "," in value and "." in value:
if value.index(",") < value.index("."):
# 0,000.00 format
value = value.replace(",", "")
else:
# 0.000,00 format
value = value.replace(".", "").replace(",", ".")
elif "," in value:
# No decimal point, only thousands separator
value = value.replace(",", "")
return float(value)


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
import pandas as pd
csv_file = io.BytesIO(csv_bytes)
df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
return df_to_typed_df(df)


def df_to_typed_df(df): # noqa
import pandas as pd
from dateutil import parser
try:

for column in df.columns:
values = df[column].dropna().unique()
if len(values) > MAX_TYPING_SAMPLE_SIZE:
values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

is_date = False
is_integer = True
is_float = True

for value in values:
try:
float_val = atof_custom(str(value))
if float_val == int(float_val):
continue # This is effectively an integer
else:
is_integer = False
except ValueError:
is_integer = False
is_float = False
break

if is_integer:
is_float = False

if not is_integer and not is_float:
is_date = True

# The dateutil parser is very aggressive and will interpret many short strings as dates.
# For example "12a" will be interpreted as 12:00 AM on the current date.
# That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
if len(try_parse) > 0:
for value in try_parse:
try:
parser.parse(str(value))
except (ValueError, TypeError, OverflowError):
is_date = False
break
else:
is_date = False

if is_date:
df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
elif is_integer:
df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
# If there are NaN / blank values, the column will be converted to float
# Convert it back to integer
df[column] = df[column].astype("Int64")
elif is_float:
df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
else:
inferred_type = pd.api.types.infer_dtype(values)
if inferred_type == "integer":
df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
elif inferred_type == "floating":
df[column] = pd.to_numeric(df[column], errors="coerce")

return df

except pd.errors.ParserError as e:
return str(e)


def is_csv(file):
return file.content_type == "text/csv"


def is_json(file):
return file.content_type == "application/json"


def is_sqlite(file):
return file.content_type == "application/x-sqlite3"
