refactor and handling different json docs
chrisclark committed Jul 15, 2024
1 parent 0306d64 commit 125fd42
Showing 8 changed files with 354 additions and 229 deletions.
46 changes: 46 additions & 0 deletions explorer/ee/db_connections/create_sqlite.py
@@ -0,0 +1,46 @@
import logging
import os

from .mime import is_csv, is_json, is_json_list, is_sqlite
from explorer.ee.db_connections.type_infer import json_to_typed_df, json_list_to_typed_df, csv_to_typed_df
from explorer.ee.db_connections.utils import pandas_to_sqlite


logger = logging.getLogger(__name__)


def get_bytes_and_name_for_upload(file):
if is_csv(file):
df_fun = csv_to_typed_df
elif is_json_list(file): # must go before is_json, as it is a subset
df_fun = json_list_to_typed_df
elif is_json(file):
df_fun = json_to_typed_df
elif is_sqlite(file):
df_fun = None
else:
        logger.error(f"File {file.name} is not a csv, json, or sqlite file.")
        raise TypeError(f"File {file.name} is not a csv, json, or sqlite file.")

try:
return parse_to_sqlite(file, df_fun)
except ValueError as e:
        logger.error(f"Error parsing {file.name}: {e}")
raise e


def parse_to_sqlite(file, df_parser):
f_name = file.name
f_bytes = file.read()
if df_parser:
df = df_parser(f_bytes)
try:
f_bytes = pandas_to_sqlite(df)
except Exception as e: # noqa
logger.exception(f"Exception while parsing file {f_name}: {e}")
raise ValueError("Error while parsing the file.")

Check failure on line 41 in explorer/ee/db_connections/create_sqlite.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (B904)

explorer/ee/db_connections/create_sqlite.py:41:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling

Check failure on line 41 in explorer/ee/db_connections/create_sqlite.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (B904)

explorer/ee/db_connections/create_sqlite.py:41:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
# replace the previous extension with .db, as it is now a sqlite file
name, _ = os.path.splitext(f_name)
f_name = f"{name}.db"
return f_bytes, f_name
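
For orientation, a minimal sketch of how an upload flows through this helper. It uses Django's SimpleUploadedFile as a stand-in for a real upload; the file name and CSV payload are invented for the example:

# Illustrative usage only -- not part of this commit.
from django.core.files.uploadedfile import SimpleUploadedFile

from explorer.ee.db_connections.create_sqlite import get_bytes_and_name_for_upload

upload = SimpleUploadedFile(
    "sales.csv",                               # invented file name
    b"region,amount\nwest,1200\neast,3400\n",  # invented payload
    content_type="text/csv",
)

# is_csv() matches, so the CSV is parsed into a typed DataFrame and
# re-serialized as SQLite bytes; the extension is swapped to .db.
f_bytes, f_name = get_bytes_and_name_for_upload(upload)
assert f_name == "sales.db"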

54 changes: 54 additions & 0 deletions explorer/ee/db_connections/mime.py
@@ -0,0 +1,54 @@
import csv
import json

# These are 'shallow' checks. They are just to understand if the upload appears valid at surface-level.
# A deeper check will happen when pandas tries to parse the file.
# This is designed to be quick, and simply assigns the right (full) parsing function to the uploaded file.


def is_csv(file):
if file.content_type != "text/csv":
return False
try:
# Check if the file content can be read as a CSV
file.seek(0)
        sample = file.read(1024).decode("utf-8")
csv.Sniffer().sniff(sample)
file.seek(0)
return True
except csv.Error:
return False


def is_json(file):
if file.content_type != "application/json":
return False
    if not file.name.lower().endswith(".json"):
return False
return True


def is_json_list(file):
    if not file.name.lower().endswith(".json"):
return False
file.seek(0)
first_line = file.readline()
file.seek(0)
try:
        json.loads(first_line.decode("utf-8"))
return True
except ValueError:
return False


def is_sqlite(file):
if file.content_type != "application/x-sqlite3":
return False
try:
# Check if the file starts with the SQLite file header
file.seek(0)
header = file.read(16)
file.seek(0)
        return header == b"SQLite format 3\x00"
except Exception as e: # noqa
return False
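
The ordering constraint noted in create_sqlite.py ("must go before is_json, as it is a subset") is easiest to see with concrete uploads. A hedged sketch, using an invented io.BytesIO stand-in that carries the two attributes these checks read:

# Illustrative only -- FakeUpload and the payloads are invented for the example.
import io

from explorer.ee.db_connections.mime import is_json, is_json_list


class FakeUpload(io.BytesIO):
    """Minimal stand-in for an uploaded file: just name + content_type."""
    def __init__(self, name, content_type, payload):
        super().__init__(payload)
        self.name = name
        self.content_type = content_type


# One JSON document spread over several lines: the first line ("{")
# is not valid JSON on its own, so is_json_list rejects it.
doc = FakeUpload("single.json", "application/json", b'{\n  "a": 1\n}\n')
assert is_json(doc) and not is_json_list(doc)

# Newline-delimited JSON: the first line is a complete document, so
# is_json_list claims the file before is_json is ever consulted.
ndjson = FakeUpload("rows.json", "application/json", b'{"a": 1}\n{"a": 2}\n')
assert is_json_list(ndjson)
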
115 changes: 115 additions & 0 deletions explorer/ee/db_connections/type_infer.py
@@ -0,0 +1,115 @@
import io
import json


MAX_TYPING_SAMPLE_SIZE = 10000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
import pandas as pd
csv_file = io.BytesIO(csv_bytes)
df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
return df_to_typed_df(df)


def json_list_to_typed_df(json_bytes):
import pandas as pd
data = []
for line in io.BytesIO(json_bytes).readlines():
        data.append(json.loads(line.decode("utf-8")))

df = pd.json_normalize(data)
return df_to_typed_df(df)


def json_to_typed_df(json_bytes):
import pandas as pd
json_file = io.BytesIO(json_bytes)
json_content = json.load(json_file)
df = pd.json_normalize(json_content)
return df_to_typed_df(df)


def atof_custom(value):
# Remove any thousands separators and convert the decimal point
if "," in value and "." in value:
if value.index(",") < value.index("."):
# 0,000.00 format
value = value.replace(",", "")
else:
# 0.000,00 format
value = value.replace(".", "").replace(",", ".")
elif "," in value:
# No decimal point, only thousands separator
value = value.replace(",", "")
    return float(value)


def df_to_typed_df(df): # noqa
import pandas as pd
from dateutil import parser
try:

for column in df.columns:
values = df[column].dropna().unique()
if len(values) > MAX_TYPING_SAMPLE_SIZE:
values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

is_date = False
is_integer = True
is_float = True

for value in values:
try:
float_val = atof_custom(str(value))
if float_val == int(float_val):
continue # This is effectively an integer
else:
is_integer = False
except ValueError:
is_integer = False
is_float = False
break

if is_integer:
is_float = False

if not is_integer and not is_float:
is_date = True

# The dateutil parser is very aggressive and will interpret many short strings as dates.
# For example "12a" will be interpreted as 12:00 AM on the current date.
# That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
if len(try_parse) > 0:
for value in try_parse:
try:
parser.parse(str(value))
except (ValueError, TypeError, OverflowError):
is_date = False
break
else:
is_date = False

if is_date:
df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
elif is_integer:
df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
# If there are NaN / blank values, the column will be converted to float
# Convert it back to integer
df[column] = df[column].astype("Int64")
elif is_float:
df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
else:
inferred_type = pd.api.types.infer_dtype(values)
if inferred_type == "integer":
df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
elif inferred_type == "floating":
df[column] = pd.to_numeric(df[column], errors="coerce")

return df

except pd.errors.ParserError as e:
return str(e)
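
To make the inference concrete, a small worked example of csv_to_typed_df on invented data (exact dtypes can vary with the pandas version):

# Illustrative only -- the sample rows are invented for the example.
from explorer.ee.db_connections.type_infer import csv_to_typed_df

csv_bytes = (
    b"id,price,signup\n"
    b'1,"1,200.50",2023-01-15\n'
    b'2,"3,400.00",2023-02-20\n'
)

df = csv_to_typed_df(csv_bytes)
# id:     whole numbers only           -> nullable Int64
# price:  atof_custom strips the comma -> float ("1,200.50" becomes 1200.5)
# signup: parses with dateutil         -> timezone-aware (UTC) datetime
print(df.dtypes)
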
118 changes: 1 addition & 117 deletions explorer/ee/db_connections/utils.py
@@ -1,6 +1,6 @@
from django.db import DatabaseError
from django.db.utils import load_backend
-import os, json
+import os

import sqlite3
import io
@@ -102,119 +102,3 @@ def pandas_to_sqlite(df, local_path="local_database.db"):
# Delete the local SQLite database file
# Finally block to ensure we don't litter files around
os.remove(local_path)


def json_list_to_typed_df(json_bytes):
import pandas as pd
data = []
for line in io.BytesIO(json_bytes).readlines():
data.append(json.loads(line.decode('utf-8')))

df = pd.json_normalize(data)
return df_to_typed_df(df)


MAX_TYPING_SAMPLE_SIZE = 10000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def atof_custom(value):
# Remove any thousands separators and convert the decimal point
if "," in value and "." in value:
if value.index(",") < value.index("."):
# 0,000.00 format
value = value.replace(",", "")
else:
# 0.000,00 format
value = value.replace(".", "").replace(",", ".")
elif "," in value:
# No decimal point, only thousands separator
value = value.replace(",", "")
return float(value)


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
import pandas as pd
csv_file = io.BytesIO(csv_bytes)
df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
return df_to_typed_df(df)


def df_to_typed_df(df): # noqa
import pandas as pd
from dateutil import parser
try:

for column in df.columns:
values = df[column].dropna().unique()
if len(values) > MAX_TYPING_SAMPLE_SIZE:
values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

is_date = False
is_integer = True
is_float = True

for value in values:
try:
float_val = atof_custom(str(value))
if float_val == int(float_val):
continue # This is effectively an integer
else:
is_integer = False
except ValueError:
is_integer = False
is_float = False
break

if is_integer:
is_float = False

if not is_integer and not is_float:
is_date = True

# The dateutil parser is very aggressive and will interpret many short strings as dates.
# For example "12a" will be interpreted as 12:00 AM on the current date.
# That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
if len(try_parse) > 0:
for value in try_parse:
try:
parser.parse(str(value))
except (ValueError, TypeError, OverflowError):
is_date = False
break
else:
is_date = False

if is_date:
df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
elif is_integer:
df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
# If there are NaN / blank values, the column will be converted to float
# Convert it back to integer
df[column] = df[column].astype("Int64")
elif is_float:
df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
else:
inferred_type = pd.api.types.infer_dtype(values)
if inferred_type == "integer":
df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
elif inferred_type == "floating":
df[column] = pd.to_numeric(df[column], errors="coerce")

return df

except pd.errors.ParserError as e:
return str(e)


def is_csv(file):
return file.content_type == "text/csv"


def is_json(file):
return file.content_type == "application/json"


def is_sqlite(file):
return file.content_type == "application/x-sqlite3"
