-
Notifications
You must be signed in to change notification settings - Fork 371
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Upload json or json-list files, in addition to CSV and SQLite
* Improvements to user uploads: refactoring, test coverage, better UI, bug fixes, logging
* Clicking a field name in schema explorer copies it to clipboard
* Limit charts to 10 numerical series for performance (past 10 it's incomprehensible anyway)
* Limit sampling of uploaded files to 5k rows for the purposes of type inference
- Loading branch information
1 parent
4b31630
commit 8972f12
Showing
27 changed files
with
1,433 additions
and
250 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import os | ||
from io import BytesIO | ||
|
||
from explorer.ee.db_connections.type_infer import get_parser | ||
from explorer.ee.db_connections.utils import pandas_to_sqlite | ||
|
||
|
||
def parse_to_sqlite(file) -> tuple[BytesIO, str]:
    """Convert an uploaded file into SQLite bytes plus a filename.

    If a parser is registered for the upload's type (CSV, JSON, JSON-lines),
    the file is parsed into a DataFrame and serialized to a SQLite database;
    the returned filename has its extension replaced with ``.db``. Files that
    are already SQLite are passed through unchanged as a BytesIO.

    Raises:
        ValueError: if serializing the parsed DataFrame to SQLite fails.
    """
    f_name = file.name
    f_bytes = file.read()
    df_parser = get_parser(file)
    if not df_parser:
        # Already a SQLite file; simply cough it up as a BytesIO object.
        return BytesIO(f_bytes), f_name
    df = df_parser(f_bytes)
    try:
        f_bytes = pandas_to_sqlite(df, local_path=f"{f_name}_tmp_local.db")
    except Exception as e:  # noqa
        raise ValueError(f"Error while parsing {f_name}: {e}") from e
    # Replace the previous extension with .db, as it is now a sqlite file
    name, _ = os.path.splitext(f_name)
    return f_bytes, f"{name}.db"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import csv | ||
import json | ||
|
||
# These are 'shallow' checks. They are just to understand if the upload appears valid at surface-level. | ||
# A deeper check will happen when pandas tries to parse the file. | ||
# This is designed to be quick, and simply assigned the right (full) parsing function to the uploaded file. | ||
|
||
|
||
def is_csv(file):
    """Shallow check that an uploaded file looks like CSV.

    Requires the declared content type to be ``text/csv``, then asks
    csv.Sniffer to detect a dialect in the first 1KB. The file position is
    rewound to the start in all cases, so later readers see the whole file.
    """
    if file.content_type != "text/csv":
        return False
    try:
        # Check if the file content can be read as a CSV
        file.seek(0)
        sample = file.read(1024).decode("utf-8")
        csv.Sniffer().sniff(sample)
        return True
    except (csv.Error, UnicodeDecodeError):
        # Either no CSV dialect could be sniffed, or the bytes are not valid
        # UTF-8 text at all. Previously a UnicodeDecodeError (binary content
        # mislabeled as text/csv) would propagate to the caller.
        return False
    finally:
        # Always rewind, including on the failure paths.
        file.seek(0)
|
||
|
||
def is_json(file):
    """Shallow check: declared JSON content type plus a .json extension."""
    if file.content_type != "application/json":
        return False
    return file.name.lower().endswith(".json")
|
||
|
||
def is_json_list(file):
    """Shallow check for JSON-lines: a .json extension and a first line
    that parses as a standalone JSON value."""
    if not file.name.lower().endswith(".json"):
        return False
    file.seek(0)
    head = file.readline()
    file.seek(0)
    try:
        json.loads(head.decode("utf-8"))
    except ValueError:
        return False
    return True
|
||
|
||
def is_sqlite(file):
    """Shallow check: declared SQLite content type and the 16-byte magic header."""
    if file.content_type != "application/x-sqlite3":
        return False
    try:
        # Read the first 16 bytes and compare to the well-known SQLite magic.
        file.seek(0)
        magic = file.read(16)
        file.seek(0)
    except Exception:  # noqa
        return False
    return magic == b"SQLite format 3\x00"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
import io | ||
import json | ||
from explorer.ee.db_connections.mime import is_csv, is_json, is_sqlite, is_json_list | ||
|
||
|
||
MAX_TYPING_SAMPLE_SIZE = 5000 | ||
SHORTEST_PLAUSIBLE_DATE_STRING = 5 | ||
|
||
|
||
def get_parser(file):
    """Pick the full parsing function matching the upload's shallow type check.

    Checks run in priority order (CSV, JSON-lines, JSON, SQLite). SQLite
    uploads map to None since they need no parsing. Raises ValueError when
    no known type matches.
    """
    dispatch = (
        (is_csv, csv_to_typed_df),
        (is_json_list, json_list_to_typed_df),
        (is_json, json_to_typed_df),
        (is_sqlite, None),
    )
    for matches, parse_fn in dispatch:
        if matches(file):
            return parse_fn
    raise ValueError(f"File {file.content_type} not supported.")
|
||
|
||
def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
    """Parse raw CSV bytes into a DataFrame with inferred column types."""
    import pandas as pd
    header_row = 0 if has_headers else None
    buffer = io.BytesIO(csv_bytes)
    raw_df = pd.read_csv(buffer, sep=delimiter, header=header_row)
    return df_to_typed_df(raw_df)
|
||
|
||
def json_list_to_typed_df(json_bytes):
    """Parse JSON-lines bytes (one JSON object per line) into a typed DataFrame."""
    import pandas as pd
    # Iterating a BytesIO yields one line per JSON record.
    records = [json.loads(raw.decode("utf-8")) for raw in io.BytesIO(json_bytes)]
    flattened = pd.json_normalize(records)
    return df_to_typed_df(flattened)
|
||
|
||
def json_to_typed_df(json_bytes):
    """Parse a single JSON document's bytes into a typed DataFrame."""
    import pandas as pd
    document = json.loads(json_bytes)
    flattened = pd.json_normalize(document)
    return df_to_typed_df(flattened)
|
||
|
||
def atof_custom(value):
    """Convert a numeric string to float, tolerating thousands separators.

    Supports both "1,234.56" (comma-grouped, dot decimal) and "1.234,56"
    (dot-grouped, comma decimal) layouts. A comma with no dot is treated as
    a thousands separator. Raises ValueError for non-numeric input.
    """
    has_comma = "," in value
    has_dot = "." in value
    if has_comma and has_dot:
        if value.index(",") < value.index("."):
            # e.g. 1,234.56 — commas are grouping only
            cleaned = value.replace(",", "")
        else:
            # e.g. 1.234,56 — dots group, the comma is the decimal point
            cleaned = value.replace(".", "").replace(",", ".")
    elif has_comma:
        # No decimal point present: treat commas as thousands separators
        cleaned = value.replace(",", "")
    else:
        cleaned = value
    return float(cleaned)
|
||
|
||
|
||
def df_to_typed_df(df):  # noqa
    """Infer and apply per-column types (datetime, Int64, float) to *df*.

    For each column, samples up to MAX_TYPING_SAMPLE_SIZE distinct non-null
    values, classifies them as integer, float, or date via atof_custom and
    dateutil, then converts the column accordingly. Returns the converted
    DataFrame; on a pandas ParserError, returns the error message as a string.
    """
    import pandas as pd
    from dateutil import parser
    try:

        for column in df.columns:

            # If we somehow have an array within a field (e.g. from a json object) then convert it to a string
            df[column] = df[column].apply(lambda x: str(x) if isinstance(x, list) else x)

            # Classify using only a bounded, deterministic sample of distinct
            # values so large columns stay cheap to type.
            values = df[column].dropna().unique()
            if len(values) > MAX_TYPING_SAMPLE_SIZE:
                values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

            # Start by assuming numeric; demote flags as counter-examples appear.
            is_date = False
            is_integer = True
            is_float = True

            for value in values:
                try:
                    float_val = atof_custom(str(value))
                    if float_val == int(float_val):
                        continue  # This is effectively an integer
                    else:
                        is_integer = False
                except ValueError:
                    # Not parseable as a number at all -> column is not numeric.
                    is_integer = False
                    is_float = False
                    break

            if is_integer:
                is_float = False

            if not is_integer and not is_float:
                is_date = True

                # The dateutil parser is very aggressive and will interpret many short strings as dates.
                # For example "12a" will be interpreted as 12:00 AM on the current date.
                # That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
                try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
                if len(try_parse) > 0:
                    for value in try_parse:
                        try:
                            parser.parse(str(value))
                        except (ValueError, TypeError, OverflowError):
                            is_date = False
                            break
                else:
                    # Every sampled value is too short to plausibly be a date.
                    is_date = False

            if is_date:
                df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
            elif is_integer:
                df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
                # If there are NaN / blank values, the column will be converted to float
                # Convert it back to integer
                df[column] = df[column].astype("Int64")
            elif is_float:
                df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
            else:
                # Fall back to pandas' own dtype inference for columns that are
                # neither numeric nor date-like.
                inferred_type = pd.api.types.infer_dtype(values)
                if inferred_type == "integer":
                    df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
                elif inferred_type == "floating":
                    df[column] = pd.to_numeric(df[column], errors="coerce")

        return df

    except pd.errors.ParserError as e:
        # NOTE(review): returns the error message as a str instead of raising —
        # callers apparently receive a string on failure; confirm before changing.
        return str(e)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.