Userland connections and licensing

explorerhq · May 25, 2024 · 220b2b3
1 parent 5693ac3
commit 220b2b3
Show file tree

Hide file tree

Showing 71 changed files with 2,172 additions and 273 deletions.
diff --git a/.gitignore b/.gitignore
@@ -22,3 +22,5 @@ docs/_build/
 .env
 tst
 tst2
+user_dbs/*
+tmp2
diff --git a/LICENSE b/LICENSE
@@ -1,3 +1,12 @@
+* All content that resides under the "explorer/ee/" directory of this repository,
+if that directory exists, is licensed under the license defined in "explorer/ee/LICENSE".
+
+* All third party components incorporated into the SQL Explorer Software are
+licensed under the original license provided by the owner of the applicable component.
+
+* Content outside of the above mentioned directories or restrictions above
+is available under the "MIT" license as defined below.
+
 The MIT License (MIT)
 
 Copyright (c) 2013 Chris Clark, ePantry LLC
@@ -18,4 +27,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE.
diff --git a/docs/features.rst b/docs/features.rst
@@ -46,10 +46,10 @@ Snapshots
 .. code-block:: python
 
     app.conf.beat_schedule = {
-       'explorer.tasks.snapshot_queries': {
-           'task': 'explorer.tasks.snapshot_queries',
-           'schedule': crontab(hour=1, minute=0)
-       }
+        "explorer.tasks.snapshot_queries": {
+            "task": "explorer.tasks.snapshot_queries",
+            "schedule": crontab(hour="1", minute="0")
+        },
     }
 
 - Requires celery, obviously. Also uses boto3. All
@@ -168,10 +168,10 @@ Query Logs
 .. code-block:: python
 
    app.conf.beat_schedule = {
-       'explorer.tasks.truncate_querylogs': {
-           'task': 'explorer.tasks.truncate_querylogs',
-           'schedule': crontab(hour=1, minute=0),
-           'kwargs': {'days': 30}
+       "explorer.tasks.truncate_querylogs": {
+           "task": "explorer.tasks.truncate_querylogs",
+           "schedule": crontab(hour="1", minute="10"),
+           "kwargs": {"days": 30}
        }
    }
 

diff --git a/explorer/admin.py b/explorer/admin.py
@@ -2,6 +2,7 @@
 
 from explorer.actions import generate_report_action
 from explorer.models import Query, ExplorerValue
+from explorer.ee.user_connections.admin import DatabaseConnectionAdmin  # noqa
 
 
 @admin.register(Query)
@@ -16,12 +17,10 @@ class QueryAdmin(admin.ModelAdmin):
 class ExplorerValueAdmin(admin.ModelAdmin):
     list_display = ("key", "value", "display_key")
     list_filter = ("key",)
-    readonly_fields = ("key",)
     search_fields = ("key", "value")
 
     def display_key(self, obj):
         # Human-readable name for the key
         return dict(ExplorerValue.EXPLORER_SETTINGS_CHOICES).get(obj.key, "")
 
     display_key.short_description = "Setting Name"
-
diff --git a/explorer/app_settings.py b/explorer/app_settings.py
@@ -2,6 +2,8 @@
 
 from django.conf import settings
 
+from explorer.ee import has_valid_license
+
 
 EXPLORER_CONNECTIONS = getattr(settings, "EXPLORER_CONNECTIONS", {})
 EXPLORER_DEFAULT_CONNECTION = getattr(
@@ -70,7 +72,7 @@
     settings, "EXPLORER_PERMISSION_CHANGE", lambda r: r.user.is_staff
 )
 EXPLORER_RECENT_QUERY_COUNT = getattr(
-    settings, "EXPLORER_RECENT_QUERY_COUNT", 10
+    settings, "EXPLORER_RECENT_QUERY_COUNT", 5
 )
 EXPLORER_ASYNC_SCHEMA = getattr(settings, "EXPLORER_ASYNC_SCHEMA", False)
 
@@ -125,7 +127,7 @@
 S3_REGION = getattr(settings, "EXPLORER_S3_REGION", "us-east-1")
 S3_ENDPOINT_URL = getattr(settings, "EXPLORER_S3_ENDPOINT_URL", None)
 S3_DESTINATION = getattr(settings, "EXPLORER_S3_DESTINATION", "")
-S3_SIGNATURE_VERSION = getattr(settings, "EXPLORER_S3_SIGNATURE_VERSION", "v2")
+S3_SIGNATURE_VERSION = getattr(settings, "EXPLORER_S3_SIGNATURE_VERSION", "v4")
 
 UNSAFE_RENDERING = getattr(settings, "EXPLORER_UNSAFE_RENDERING", False)
 
@@ -146,8 +148,18 @@
 
 # AI Assistant settings. Setting the first to an OpenAI key is the simplest way to enable the assistant
 EXPLORER_AI_API_KEY = getattr(settings, "EXPLORER_AI_API_KEY", None)
+
 EXPLORER_ASSISTANT_BASE_URL = getattr(settings, "EXPLORER_ASSISTANT_BASE_URL", "https://api.openai.com/v1")
 EXPLORER_ASSISTANT_MODEL = getattr(settings, "EXPLORER_ASSISTANT_MODEL",
                                    # Return the model name and max_tokens it supports
-                                   {"name": "gpt-4-0125-preview",
+                                   {"name": "gpt-4o",
                                     "max_tokens": 128000})
+
+EXPLORER_USER_UPLOADS_ENABLED = getattr(settings, "EXPLORER_USER_UPLOADS_ENABLED", False)
+EXPLORER_PRUNE_LOCAL_UPLOAD_COPY_DAYS_INACTIVITY = getattr(settings,
+                                                           "EXPLORER_PRUNE_LOCAL_UPLOAD_COPY_DAYS_INACTIVITY", 7)
+EXPLORER_LICENSE_KEY = getattr(settings,"EXPLORER_LICENSE_KEY", None)
+
+
+def has_assistant(): return EXPLORER_AI_API_KEY is not None
+def has_user_uploads(): return has_valid_license() and EXPLORER_USER_UPLOADS_ENABLED
diff --git a/explorer/apps.py b/explorer/apps.py
@@ -28,8 +28,9 @@ def _get_explorer_connections():
 
 def _validate_connections():
 
-    # Validate connections
-    if _get_default() not in _get_explorer_connections().values():
+    # Validate connections, when using settings.EXPLORER_CONNECTIONS
+    # Skip if none are configured, as the app will use user-configured connections (DatabaseConnection models)
+    if _get_explorer_connections().values() and _get_default() not in _get_explorer_connections().values():
         raise ImproperlyConfigured(
             f"EXPLORER_DEFAULT_CONNECTION is {_get_default()}, "
             f"but that alias is not present in the values of "

diff --git a/explorer/assistant/tests.py b/explorer/assistant/tests.py
diff --git a/explorer/assistant/utils.py b/explorer/assistant/utils.py
@@ -1,11 +1,13 @@
 from explorer import app_settings
 from explorer.schema import schema_info
+from explorer.models import ExplorerValue
 from explorer.utils import get_valid_connection
 from django.db.utils import OperationalError
 
 
 OPENAI_MODEL = app_settings.EXPLORER_ASSISTANT_MODEL["name"]
 ROW_SAMPLE_SIZE = 2
+MAX_FIELD_SAMPLE_SIZE = 500  # characters
 
 
 def openai_client():
@@ -41,20 +43,46 @@ def tables_from_schema_info(connection, table_names):
 def sample_rows_from_tables(connection, table_names):
     ret = ""
     for table_name in table_names:
-        ret = f"SAMPLE FROM TABLE {table_name}:\n"
-        ret = ret + format_rows_from_table(
+        ret += f"SAMPLE FROM TABLE {table_name}:\n"
+        ret += format_rows_from_table(
             sample_rows_from_table(connection, table_name)
         ) + "\n\n"
     return ret
 
 
 def sample_rows_from_table(connection, table_name):
+    """
+    Fetches a sample of rows from the specified table and ensures that any field values
+    exceeding 500 characters (or bytes) are truncated. This is useful for handling fields
+    like "description" that might contain very long strings of text or binary data.
+    Truncating these fields prevents issues with displaying or processing overly large values.
+    An ellipsis ("...") is appended to indicate that the data has been truncated.
+
+    Args:
+        connection: The database connection.
+        table_name: The name of the table to sample rows from.
+
+    Returns:
+        The header row (column names) followed by the sampled rows, with long str/bytes values truncated.
+    """
     conn = get_valid_connection(connection)
     cursor = conn.cursor()
     try:
         cursor.execute(f"SELECT * FROM {table_name} LIMIT {ROW_SAMPLE_SIZE}")
         ret = [[header[0] for header in cursor.description]]
-        ret = ret + cursor.fetchall()
+        rows = cursor.fetchall()
+
+        for row in rows:
+            processed_row = []
+            for field in row:
+                new_val = field
+                if isinstance(field, str) and len(field) > MAX_FIELD_SAMPLE_SIZE:
+                    new_val = field[:MAX_FIELD_SAMPLE_SIZE] + "..."  # Truncate and add ellipsis
+                elif isinstance(field, (bytes, bytearray)) and len(field) > MAX_FIELD_SAMPLE_SIZE:
+                    new_val = field[:MAX_FIELD_SAMPLE_SIZE] + b"..."  # Truncate binary data
+                processed_row.append(new_val)
+            ret.append(processed_row)
+
         return ret
     except OperationalError as e:
         return [[str(e)]]
@@ -83,7 +111,10 @@ def get_table_names_from_query(sql):
 def num_tokens_from_string(string: str) -> int:
     """Returns the number of tokens in a text string."""
     import tiktoken
-    encoding = tiktoken.encoding_for_model(OPENAI_MODEL)
+    try:
+        encoding = tiktoken.encoding_for_model(OPENAI_MODEL)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
     num_tokens = len(encoding.encode(string))
     return num_tokens
 
@@ -92,3 +123,35 @@ def fits_in_window(string: str) -> bool:
     # Ratchet down by 5% to account for other boilerplate and system prompt
     # TODO make this better by actually looking at the token count of the system prompt
     return num_tokens_from_string(string) < (app_settings.EXPLORER_ASSISTANT_MODEL["max_tokens"] * 0.95)
+
+
+def build_prompt(request_data, included_tables):
+    user_prompt = ""
+
+    db_vendor = get_valid_connection(request_data.get("connection")).vendor
+    user_prompt += f"## Database Vendor / SQL Flavor is {db_vendor}\n\n"
+
+    db_error = request_data.get("db_error")
+    if db_error:
+        user_prompt += f"## Query Error ##\n\n{db_error}\n\n"
+
+    sql = request_data.get("sql")
+    if sql:
+        user_prompt += f"## Existing SQL ##\n\n{sql}\n\n"
+
+    results_sample = sample_rows_from_tables(request_data["connection"],
+                                             included_tables)
+    if fits_in_window(user_prompt + results_sample):
+        user_prompt += f"## Table Structure with Sampled Data ##\n\n{results_sample}\n\n"
+    else:  # If it's too large with sampling, then provide *just* the structure
+        table_struct = tables_from_schema_info(request_data["connection"],
+                                               included_tables)
+        user_prompt += f"## Table Structure ##\n\n{table_struct}\n\n"
+
+    user_prompt += f"## User's Request to Assistant ##\n\n{request_data['assistant_request']}\n\n"
+
+    prompt = {
+        "system": ExplorerValue.objects.get_item(ExplorerValue.ASSISTANT_SYSTEM_PROMPT).value,
+        "user": user_prompt
+    }
+    return prompt
diff --git a/.gitignore b/.gitignore
@@ -22,3 +22,5 @@ docs/_build/
 .env
 tst
 tst2
+user_dbs/*
+tmp2