From 125fd42fdf2f3f3bcdc323a1d1228171ca4bb866 Mon Sep 17 00:00:00 2001 From: Chris Clark Date: Mon, 15 Jul 2024 10:11:03 -0400 Subject: [PATCH] refactor and handling different json docs --- explorer/ee/db_connections/create_sqlite.py | 46 ++++++++ explorer/ee/db_connections/mime.py | 54 +++++++++ explorer/ee/db_connections/type_infer.py | 115 +++++++++++++++++++ explorer/ee/db_connections/utils.py | 118 +------------------- explorer/ee/db_connections/views.py | 52 ++------- explorer/tests/test_db_connection_utils.py | 69 +----------- explorer/tests/test_mime.py | 71 ++++++++++++ explorer/tests/test_type_infer.py | 58 ++++++++++ 8 files changed, 354 insertions(+), 229 deletions(-) create mode 100644 explorer/ee/db_connections/create_sqlite.py create mode 100644 explorer/ee/db_connections/mime.py create mode 100644 explorer/ee/db_connections/type_infer.py create mode 100644 explorer/tests/test_mime.py create mode 100644 explorer/tests/test_type_infer.py diff --git a/explorer/ee/db_connections/create_sqlite.py b/explorer/ee/db_connections/create_sqlite.py new file mode 100644 index 00000000..67d16069 --- /dev/null +++ b/explorer/ee/db_connections/create_sqlite.py @@ -0,0 +1,46 @@ +import logging +import os + +from .mime import is_csv, is_json, is_json_list, is_sqlite +from explorer.ee.db_connections.type_infer import json_to_typed_df, json_list_to_typed_df, csv_to_typed_df +from explorer.ee.db_connections.utils import pandas_to_sqlite + + +logger = logging.getLogger(__name__) + + +def get_bytes_and_name_for_upload(file): + if is_csv(file): + df_fun = csv_to_typed_df + elif is_json_list(file): # must go before is_json, as it is a subset + df_fun = json_list_to_typed_df + elif is_json(file): + df_fun = json_to_typed_df + elif is_sqlite(file): + df_fun = None + else: + logger.error(f'File {file.name} is not a csv, json, or sqlite file.') + raise TypeError(f'File {file.name} is not a csv, json, or sqlite file.') + + try: + return parse_to_sqlite(file, df_fun) + except 
ValueError as e: + logger.error(f'Error parsing {file.name}: {e}') + raise e + + +def parse_to_sqlite(file, df_parser): + f_name = file.name + f_bytes = file.read() + if df_parser: + df = df_parser(f_bytes) + try: + f_bytes = pandas_to_sqlite(df) + except Exception as e: # noqa + logger.exception(f"Exception while parsing file {f_name}: {e}") + raise ValueError("Error while parsing the file.") + # replace the previous extension with .db, as it is now a sqlite file + name, _ = os.path.splitext(f_name) + f_name = f"{name}.db" + return f_bytes, f_name + diff --git a/explorer/ee/db_connections/mime.py b/explorer/ee/db_connections/mime.py new file mode 100644 index 00000000..5ac7bb2f --- /dev/null +++ b/explorer/ee/db_connections/mime.py @@ -0,0 +1,54 @@ +import csv +import json + +# These are 'shallow' checks. They are just to understand if the upload appears valid at surface-level. +# A deeper check will happen when pandas tries to parse the file. +# This is designed to be quick, and simply assigns the right (full) parsing function to the uploaded file. 
+ + +def is_csv(file): + if file.content_type != "text/csv": + return False + try: + # Check if the file content can be read as a CSV + file.seek(0) + sample = file.read(1024).decode('utf-8') + csv.Sniffer().sniff(sample) + file.seek(0) + return True + except csv.Error: + return False + + +def is_json(file): + if file.content_type != "application/json": + return False + if not file.name.lower().endswith('.json'): + return False + return True + + +def is_json_list(file): + if not file.name.lower().endswith('.json'): + return False + file.seek(0) + first_line = file.readline() + file.seek(0) + try: + json.loads(first_line.decode('utf-8')) + return True + except ValueError: + return False + + +def is_sqlite(file): + if file.content_type != "application/x-sqlite3": + return False + try: + # Check if the file starts with the SQLite file header + file.seek(0) + header = file.read(16) + file.seek(0) + return header == b'SQLite format 3\x00' + except Exception as e: # noqa + return False diff --git a/explorer/ee/db_connections/type_infer.py b/explorer/ee/db_connections/type_infer.py new file mode 100644 index 00000000..537a6bdc --- /dev/null +++ b/explorer/ee/db_connections/type_infer.py @@ -0,0 +1,115 @@ +import io +import json + + +MAX_TYPING_SAMPLE_SIZE = 10000 +SHORTEST_PLAUSIBLE_DATE_STRING = 5 + + +def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True): + import pandas as pd + csv_file = io.BytesIO(csv_bytes) + df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None) + return df_to_typed_df(df) + + +def json_list_to_typed_df(json_bytes): + import pandas as pd + data = [] + for line in io.BytesIO(json_bytes).readlines(): + data.append(json.loads(line.decode('utf-8'))) + + df = pd.json_normalize(data) + return df_to_typed_df(df) + + +def json_to_typed_df(json_bytes): + import pandas as pd + json_file = io.BytesIO(json_bytes) + json_content = json.load(json_file) + df = pd.json_normalize(json_content) + return df_to_typed_df(df) + + +def 
atof_custom(value): + # Remove any thousands separators and convert the decimal point + if "," in value and "." in value: + if value.index(",") < value.index("."): + # 0,000.00 format + value = value.replace(",", "") + else: + # 0.000,00 format + value = value.replace(".", "").replace(",", ".") + elif "," in value: + # No decimal point, only thousands separator + value = value.replace(",", "") + return float(value) + + + +def df_to_typed_df(df): # noqa + import pandas as pd + from dateutil import parser + try: + + for column in df.columns: + values = df[column].dropna().unique() + if len(values) > MAX_TYPING_SAMPLE_SIZE: + values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy() + + is_date = False + is_integer = True + is_float = True + + for value in values: + try: + float_val = atof_custom(str(value)) + if float_val == int(float_val): + continue # This is effectively an integer + else: + is_integer = False + except ValueError: + is_integer = False + is_float = False + break + + if is_integer: + is_float = False + + if not is_integer and not is_float: + is_date = True + + # The dateutil parser is very aggressive and will interpret many short strings as dates. + # For example "12a" will be interpreted as 12:00 AM on the current date. + # That is not the behavior anyone wants. The shortest plausible date string is e.g. 
1-1-23 + try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING] + if len(try_parse) > 0: + for value in try_parse: + try: + parser.parse(str(value)) + except (ValueError, TypeError, OverflowError): + is_date = False + break + else: + is_date = False + + if is_date: + df[column] = pd.to_datetime(df[column], errors="coerce", utc=True) + elif is_integer: + df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x) + # If there are NaN / blank values, the column will be converted to float + # Convert it back to integer + df[column] = df[column].astype("Int64") + elif is_float: + df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x) + else: + inferred_type = pd.api.types.infer_dtype(values) + if inferred_type == "integer": + df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer") + elif inferred_type == "floating": + df[column] = pd.to_numeric(df[column], errors="coerce") + + return df + + except pd.errors.ParserError as e: + return str(e) diff --git a/explorer/ee/db_connections/utils.py b/explorer/ee/db_connections/utils.py index afa2b0e0..35bcefef 100644 --- a/explorer/ee/db_connections/utils.py +++ b/explorer/ee/db_connections/utils.py @@ -1,6 +1,6 @@ from django.db import DatabaseError from django.db.utils import load_backend -import os, json +import os import sqlite3 import io @@ -102,119 +102,3 @@ def pandas_to_sqlite(df, local_path="local_database.db"): # Delete the local SQLite database file # Finally block to ensure we don't litter files around os.remove(local_path) - - -def json_list_to_typed_df(json_bytes): - import pandas as pd - data = [] - for line in io.BytesIO(json_bytes).readlines(): - data.append(json.loads(line.decode('utf-8'))) - - df = pd.json_normalize(data) - return df_to_typed_df(df) - - -MAX_TYPING_SAMPLE_SIZE = 10000 -SHORTEST_PLAUSIBLE_DATE_STRING = 5 - - -def atof_custom(value): - # Remove any thousands separators and convert the decimal 
point - if "," in value and "." in value: - if value.index(",") < value.index("."): - # 0,000.00 format - value = value.replace(",", "") - else: - # 0.000,00 format - value = value.replace(".", "").replace(",", ".") - elif "," in value: - # No decimal point, only thousands separator - value = value.replace(",", "") - return float(value) - - -def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True): - import pandas as pd - csv_file = io.BytesIO(csv_bytes) - df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None) - return df_to_typed_df(df) - - -def df_to_typed_df(df): # noqa - import pandas as pd - from dateutil import parser - try: - - for column in df.columns: - values = df[column].dropna().unique() - if len(values) > MAX_TYPING_SAMPLE_SIZE: - values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy() - - is_date = False - is_integer = True - is_float = True - - for value in values: - try: - float_val = atof_custom(str(value)) - if float_val == int(float_val): - continue # This is effectively an integer - else: - is_integer = False - except ValueError: - is_integer = False - is_float = False - break - - if is_integer: - is_float = False - - if not is_integer and not is_float: - is_date = True - - # The dateutil parser is very aggressive and will interpret many short strings as dates. - # For example "12a" will be interpreted as 12:00 AM on the current date. - # That is not the behavior anyone wants. The shortest plausible date string is e.g. 
1-1-23 - try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING] - if len(try_parse) > 0: - for value in try_parse: - try: - parser.parse(str(value)) - except (ValueError, TypeError, OverflowError): - is_date = False - break - else: - is_date = False - - if is_date: - df[column] = pd.to_datetime(df[column], errors="coerce", utc=True) - elif is_integer: - df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x) - # If there are NaN / blank values, the column will be converted to float - # Convert it back to integer - df[column] = df[column].astype("Int64") - elif is_float: - df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x) - else: - inferred_type = pd.api.types.infer_dtype(values) - if inferred_type == "integer": - df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer") - elif inferred_type == "floating": - df[column] = pd.to_numeric(df[column], errors="coerce") - - return df - - except pd.errors.ParserError as e: - return str(e) - - -def is_csv(file): - return file.content_type == "text/csv" - - -def is_json(file): - return file.content_type == "application/json" - - -def is_sqlite(file): - return file.content_type == "application/x-sqlite3" diff --git a/explorer/ee/db_connections/views.py b/explorer/ee/db_connections/views.py index 9b7abf79..0811650e 100644 --- a/explorer/ee/db_connections/views.py +++ b/explorer/ee/db_connections/views.py @@ -7,14 +7,9 @@ from explorer.models import DatabaseConnection from explorer.ee.db_connections.utils import ( upload_sqlite, - create_connection_for_uploaded_sqlite, - is_csv, - is_json, - is_sqlite, - csv_to_typed_df, - json_list_to_typed_df, - pandas_to_sqlite + create_connection_for_uploaded_sqlite ) +from explorer.ee.db_connections.create_sqlite import get_bytes_and_name_for_upload from explorer import app_settings from explorer.app_settings import EXPLORER_MAX_UPLOAD_SIZE from explorer.ee.db_connections.forms import 
DatabaseConnectionForm @@ -27,37 +22,6 @@ logger = logging.getLogger(__name__) -def handle_json(file): - f_name = file.name - f_bytes = file.read() - df = json_list_to_typed_df(f_bytes) - try: - f_bytes = pandas_to_sqlite(df) - except Exception as e: # noqa - logger.exception(f"Exception while parsing file {f_name}: {e}") - return JsonResponse({"error": "Error while parsing the file."}, status=400) - - f_name = f_name.replace("json", "db") - return f_bytes, f_name - - -def handle_csv(file): - f_name = file.name - f_bytes = file.read() - df = csv_to_typed_df(f_bytes) - try: - f_bytes = pandas_to_sqlite(df) - except Exception as e: # noqa - logger.exception(f"Exception while parsing file {f_name}: {e}") - return JsonResponse({"error": "Error while parsing the file."}, status=400) - - f_name = f_name.replace("csv", "db") - - -def handle_sqlite(file): - return file.read(), file.name - - class UploadDbView(PermissionRequiredMixin, View): permission_required = "connections_permission" @@ -69,14 +33,12 @@ def post(self, request): friendly = EXPLORER_MAX_UPLOAD_SIZE / (1024 * 1024) return JsonResponse({"error": f"File size exceeds the limit of {friendly} MB"}, status=400) - if is_json(file): - f_bytes, f_name = handle_json(file) - elif is_csv(file): - f_bytes, f_name = handle_csv(file) - elif is_sqlite(file): - f_bytes, f_name = handle_sqlite(file) - else: + try: + f_bytes, f_name = get_bytes_and_name_for_upload(file) + except TypeError as e: return JsonResponse({"error": "File was not csv, json, or sqlite."}, status=400) + except ValueError as e: + return JsonResponse({"error": "Error parsing file."}, status=400) try: s3_path = f"user_dbs/user_{request.user.id}/{f_name}" diff --git a/explorer/tests/test_db_connection_utils.py b/explorer/tests/test_db_connection_utils.py index d4576675..fa6d7cf4 100644 --- a/explorer/tests/test_db_connection_utils.py +++ b/explorer/tests/test_db_connection_utils.py @@ -1,6 +1,5 @@ from django.test import TestCase from unittest import 
skipIf -from django.core.files.uploadedfile import SimpleUploadedFile from explorer.app_settings import EXPLORER_USER_UPLOADS_ENABLED if EXPLORER_USER_UPLOADS_ENABLED: import pandas as pd @@ -11,58 +10,10 @@ from explorer.ee.db_connections.utils import ( get_sqlite_for_connection, create_django_style_connection, - pandas_to_sqlite, - is_csv, - csv_to_typed_df + pandas_to_sqlite ) -def _get_csv(csv_name): - current_script_dir = os.path.dirname(os.path.abspath(__file__)) - file_path = os.path.join(current_script_dir, "csvs", csv_name) - - # Open the file in binary mode and read its contents - with open(file_path, "rb") as file: - csv_bytes = file.read() - - return csv_bytes - - -@skipIf(not EXPLORER_USER_UPLOADS_ENABLED, "User uploads not enabled") -class TestCsvToTypedDf(TestCase): - - def test_mixed_types(self): - df = csv_to_typed_df(_get_csv("mixed.csv")) - self.assertTrue(pd.api.types.is_object_dtype(df["Value1"])) - self.assertTrue(pd.api.types.is_object_dtype(df["Value2"])) - self.assertTrue(pd.api.types.is_object_dtype(df["Value3"])) - - def test_all_types(self): - df = csv_to_typed_df(_get_csv("all_types.csv")) - self.assertTrue(pd.api.types.is_datetime64_ns_dtype(df["Dates"])) - print(df["Integers"].dtype) - self.assertTrue(pd.api.types.is_integer_dtype(df["Integers"])) - self.assertTrue(pd.api.types.is_float_dtype(df["Floats"])) - self.assertTrue(pd.api.types.is_object_dtype(df["Strings"])) - - def test_integer_parsing(self): - df = csv_to_typed_df(_get_csv("integers.csv")) - self.assertTrue(pd.api.types.is_integer_dtype(df["Integers"])) - self.assertTrue(pd.api.types.is_integer_dtype(df["More_integers"])) - - def test_float_parsing(self): - df = csv_to_typed_df(_get_csv("floats.csv")) - self.assertTrue(pd.api.types.is_float_dtype(df["Floats"])) - - def test_date_parsing(self): - - # Will not handle these formats: - # Unix Timestamp: 1706232300 (Seconds since Unix Epoch - 1970-01-01 00:00:00 UTC) - # ISO 8601 Week Number: 2024-W04-3 
(Year-WWeekNumber-Weekday) - # Day of Year: 2024-024 (Year-DayOfYear) - - df = csv_to_typed_df(_get_csv("dates.csv")) - self.assertTrue(pd.api.types.is_datetime64_ns_dtype(df["Dates"])) @skipIf(not EXPLORER_USER_UPLOADS_ENABLED, "User uploads not enabled") @@ -167,7 +118,7 @@ def test_pandas_to_sqlite(self): con = sqlite3.connect(temp_db_path) try: cursor = con.cursor() - cursor.execute("SELECT * FROM data") + cursor.execute("SELECT * FROM data") # noqa rows = cursor.fetchall() # Verify the content of the SQLite database @@ -180,19 +131,3 @@ def test_pandas_to_sqlite(self): os.remove(temp_db_path) -class TestIsCsvFunction(TestCase): - - def test_is_csv_with_csv_file(self): - # Create a SimpleUploadedFile with content_type set to "text/csv" - csv_file = SimpleUploadedFile("test.csv", b"column1,column2\n1,A\n2,B", content_type="text/csv") - self.assertTrue(is_csv(csv_file)) - - def test_is_csv_with_non_csv_file(self): - # Create a SimpleUploadedFile with content_type set to "text/plain" - txt_file = SimpleUploadedFile("test.txt", b"Just some text", content_type="text/plain") - self.assertFalse(is_csv(txt_file)) - - def test_is_csv_with_empty_content_type(self): - # Create a SimpleUploadedFile with an empty content_type - empty_file = SimpleUploadedFile("test.csv", b"column1,column2\n1,A\n2,B", content_type="") - self.assertFalse(is_csv(empty_file)) diff --git a/explorer/tests/test_mime.py b/explorer/tests/test_mime.py new file mode 100644 index 00000000..f4c395cc --- /dev/null +++ b/explorer/tests/test_mime.py @@ -0,0 +1,71 @@ +from django.test import TestCase +from django.core.files.uploadedfile import SimpleUploadedFile +from explorer.ee.db_connections.mime import is_sqlite, is_json, is_json_list, is_csv + + +class TestIsCsvFunction(TestCase): + + def test_is_csv_with_csv_file(self): + # Create a SimpleUploadedFile with content_type set to "text/csv" + csv_file = SimpleUploadedFile("test.csv", b"column1,column2\n1,A\n2,B", content_type="text/csv") + 
self.assertTrue(is_csv(csv_file)) + + def test_is_csv_with_non_csv_file(self): + # Create a SimpleUploadedFile with content_type set to "text/plain" + txt_file = SimpleUploadedFile("test.txt", b"Just some text", content_type="text/plain") + self.assertFalse(is_csv(txt_file)) + + def test_is_csv_with_empty_content_type(self): + # Create a SimpleUploadedFile with an empty content_type + empty_file = SimpleUploadedFile("test.csv", b"column1,column2\n1,A\n2,B", content_type="") + self.assertFalse(is_csv(empty_file)) + + +class TestIsJsonFunction(TestCase): + + def test_is_json_with_valid_json(self): + long_json = '{"key1": "value1", "key2": {"subkey1": "subvalue1", "subkey2": "subvalue2"}, "key3": [1, 2, 3, 4]}' + json_file = SimpleUploadedFile("test.json", long_json.encode('utf-8'), content_type="application/json") + self.assertTrue(is_json(json_file)) + + def test_is_json_with_non_json_file(self): + txt_file = SimpleUploadedFile("test.txt", b'Just some text', content_type="text/plain") + self.assertFalse(is_json(txt_file)) + + def test_is_json_with_wrong_extension(self): + long_json = '{"key1": "value1", "key2": {"subkey1": "subvalue1", "subkey2": "subvalue2"}, "key3": [1, 2, 3, 4]}' + json_file = SimpleUploadedFile("test.txt", long_json.encode('utf-8'), content_type="application/json") + self.assertFalse(is_json(json_file)) + + def test_is_json_with_empty_content_type(self): + long_json = '{"key1": "value1", "key2": {"subkey1": "subvalue1", "subkey2": "subvalue2"}, "key3": [1, 2, 3, 4]}' + json_file = SimpleUploadedFile("test.json", long_json.encode('utf-8'), content_type="") + self.assertFalse(is_json(json_file)) + + +class TestIsJsonListFunction(TestCase): + + def test_is_json_list_with_valid_json_lines(self): + json_lines = b'{"key1": "value1"}\n{"key2": "value2"}\n{"key3": {"subkey1": "subvalue1"}}\n' + json_file = SimpleUploadedFile("test.json", json_lines, content_type="application/json") + self.assertTrue(is_json_list(json_file)) + + def 
test_is_json_list_with_non_json_file(self): + txt_file = SimpleUploadedFile("test.txt", b'Just some text', content_type="text/plain") + self.assertFalse(is_json_list(txt_file)) + + def test_is_json_list_with_invalid_json_lines(self): + # This is actually going to *pass* the check, because it's a shallow file-type check, not a comprehensive + # one. That's ok! This type of error will get caught later, when pandas tries to parse it + invalid_json_lines = b'{"key1": "value1"}\nNot a JSON content\n{"key3": {"subkey1": "subvalue1"}}\n' + json_file = SimpleUploadedFile("test.json", invalid_json_lines, content_type="application/json") + self.assertTrue(is_json_list(json_file)) + + def test_is_json_list_with_wrong_extension(self): + json_lines = b'{"key1": "value1"}\n{"key2": "value2"}\n{"key3": {"subkey1": "subvalue1"}}\n' + json_file = SimpleUploadedFile("test.txt", json_lines, content_type="application/json") + self.assertFalse(is_json_list(json_file)) + + def test_is_json_list_with_empty_file(self): + json_file = SimpleUploadedFile("test.json", b'', content_type="application/json") + self.assertFalse(is_json_list(json_file)) diff --git a/explorer/tests/test_type_infer.py b/explorer/tests/test_type_infer.py new file mode 100644 index 00000000..50c789da --- /dev/null +++ b/explorer/tests/test_type_infer.py @@ -0,0 +1,58 @@ +from django.test import TestCase +from unittest import skipIf +from explorer.app_settings import EXPLORER_USER_UPLOADS_ENABLED +if EXPLORER_USER_UPLOADS_ENABLED: + import pandas as pd +import os +from explorer.ee.db_connections.type_infer import csv_to_typed_df + + + +def _get_csv(csv_name): + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + file_path = os.path.join(current_script_dir, "csvs", csv_name) + + # Open the file in binary mode and read its contents + with open(file_path, "rb") as file: + csv_bytes = file.read() + + return csv_bytes + + +@skipIf(not EXPLORER_USER_UPLOADS_ENABLED, "User uploads not enabled") +class 
TestCsvToTypedDf(TestCase): + + def test_mixed_types(self): + df = csv_to_typed_df(_get_csv("mixed.csv")) + self.assertTrue(pd.api.types.is_object_dtype(df["Value1"])) + self.assertTrue(pd.api.types.is_object_dtype(df["Value2"])) + self.assertTrue(pd.api.types.is_object_dtype(df["Value3"])) + + def test_all_types(self): + df = csv_to_typed_df(_get_csv("all_types.csv")) + self.assertTrue(pd.api.types.is_datetime64_ns_dtype(df["Dates"])) + print(df["Integers"].dtype) + self.assertTrue(pd.api.types.is_integer_dtype(df["Integers"])) + self.assertTrue(pd.api.types.is_float_dtype(df["Floats"])) + self.assertTrue(pd.api.types.is_object_dtype(df["Strings"])) + + def test_integer_parsing(self): + df = csv_to_typed_df(_get_csv("integers.csv")) + self.assertTrue(pd.api.types.is_integer_dtype(df["Integers"])) + self.assertTrue(pd.api.types.is_integer_dtype(df["More_integers"])) + + def test_float_parsing(self): + df = csv_to_typed_df(_get_csv("floats.csv")) + self.assertTrue(pd.api.types.is_float_dtype(df["Floats"])) + + def test_date_parsing(self): + + # Will not handle these formats: + # Unix Timestamp: 1706232300 (Seconds since Unix Epoch - 1970-01-01 00:00:00 UTC) + # ISO 8601 Week Number: 2024-W04-3 (Year-WWeekNumber-Weekday) + # Day of Year: 2024-024 (Year-DayOfYear) + + df = csv_to_typed_df(_get_csv("dates.csv")) + self.assertTrue(pd.api.types.is_datetime64_ns_dtype(df["Dates"])) + +