Additional table movement experiment improvements (#506)

This is primarily for the specialized scenario.

- Remove the embeddings table from Athena.
- Exclude it from the all-tables constraint (in the VDBE abstraction, this table will have different constraints anyway, so this is natural).
- Add a defensive data type conversion for the vector data type (unsure if unloading will even work).

Part of #487.
mitdbg · May 3, 2024 · e297115 · e297115
1 parent 025dff6
commit e297115
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 4 deletions.
diff --git a/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py b/experiments/15-e2e-scenarios-v2/specialized/set_up_vector_blueprint.py
@@ -133,8 +133,10 @@ def main():
         new_placement[table.name] = [Engine.Aurora, Engine.Athena, Engine.Redshift]
         if table.name == "telemetry":
             new_placement[table.name] = [Engine.Athena]
-        if table.name == "embeddings" or table.name == "title":
+        if table.name == "title":
             new_placement[table.name] = [Engine.Aurora, Engine.Athena]
+        if table.name == "embeddings":
+            new_placement[table.name] = [Engine.Aurora]
     enum_blueprint.set_table_locations(new_placement)
 
     # 6. Transition to the new blueprint.

diff --git a/src/brad/blueprint/sql_gen/table.py b/src/brad/blueprint/sql_gen/table.py
@@ -318,5 +318,12 @@ def _type_for(data_type: str, for_db: Engine) -> str:
         return "BIGINT"
     elif data_type_upper.startswith("VARCHAR") and for_db == Engine.Athena:
         return "STRING"
+    elif data_type_upper.startswith("VECTOR"):
+        if for_db == Engine.Athena:
+            return "BINARY"
+        elif for_db == Engine.Redshift:
+            return "VARBYTE"
+        else:
+            return data_type
     else:
         return data_type
diff --git a/src/brad/planner/beam/fpqb.py b/src/brad/planner/beam/fpqb.py
@@ -74,7 +74,11 @@ async def _run_replan_impl(
             # on at least one engine. This ensures that arbitrary unseen join
             # templates can always be immediately handled.
             all_tables = ", ".join(
-                [table.name for table in self._current_blueprint.tables()]
+                [
+                    table.name
+                    for table in self._current_blueprint.tables()
+                    if table.name != "embeddings"
+                ]
             )
             next_workload.add_priming_analytical_query(
                 f"SELECT 1 FROM {all_tables} LIMIT 1"

diff --git a/src/brad/planner/beam/query_based.py b/src/brad/planner/beam/query_based.py
@@ -75,7 +75,11 @@ async def _run_replan_impl(
             # on at least one engine. This ensures that arbitrary unseen join
             # templates can always be immediately handled.
             all_tables = ", ".join(
-                [table.name for table in self._current_blueprint.tables()]
+                [
+                    table.name
+                    for table in self._current_blueprint.tables()
+                    if table.name != "embeddings"
+                ]
             )
             next_workload.add_priming_analytical_query(
                 f"SELECT 1 FROM {all_tables} LIMIT 1"

diff --git a/src/brad/planner/beam/table_based.py b/src/brad/planner/beam/table_based.py
@@ -75,7 +75,11 @@ async def _run_replan_impl(
             # on at least one engine. This ensures that arbitrary unseen join
             # templates can always be immediately handled.
             all_tables = ", ".join(
-                [table.name for table in self._current_blueprint.tables()]
+                [
+                    table.name
+                    for table in self._current_blueprint.tables()
+                    if table.name != "embeddings"
+                ]
             )
             next_workload.add_priming_analytical_query(
                 f"SELECT 1 FROM {all_tables} LIMIT 1"