def is_geospatial(self) -> bool:
    """
    Returns True if the query references a known geospatial function.

    The raw SQL text is upper-cased and split into identifier tokens; the
    query is considered geospatial when any token equals an entry of
    `_GEOSPATIAL_KEYWORDS` (which holds upper-cased keywords).

    Whole-identifier matching is deliberate. A plain substring test
    misclassifies ordinary queries because the keyword list contains
    operator entries such as `=` and `&&` (present in nearly every
    query) and short names such as `ST_M` (a substring of e.g.
    `BEST_MATCH`). Operator entries can never equal an identifier token,
    so they are ignored here.
    """
    # Imported locally to keep this change self-contained.
    import re

    identifiers = set(
        re.findall(r"[A-Z_][A-Z0-9_]*", self._raw_sql_query.upper())
    )
    return any(keyword in identifiers for keyword in _GEOSPATIAL_KEYWORDS)


def get_required_functionality(self) -> int:
    """
    Returns a bitmap of the specialized functionality this query needs.

    Only geospatial functionality is detected at the moment; a query that
    needs nothing special yields the bitmap of an empty list.
    """
    required = [Functionality.Geospatial] if self.is_geospatial() else []
    return Functionality.to_bitmap(required)
import operator
from functools import reduce
from pathlib import Path
from typing import Dict, List


class Functionality:
    """
    Catalog of the specialized functionality supported by each engine.

    The catalog is read from a YAML file with a top-level
    `database_engines` list; every entry has a `name` and a list of
    `functionalities` (strings such as `Functionality.Geospatial`).
    """

    Geospatial = "geospatial"

    # Order of the bitmaps returned by `get_engine_functionalities()`.
    # The router pairs them positionally with [Athena, Aurora, Redshift].
    _ENGINE_ORDER = ("Athena", "Aurora", "Redshift")

    def __init__(self, functionality_yaml="engine_functionality.yml"):
        """
        Loads the catalog.

        `functionality_yaml` is used as given when it is absolute or exists
        relative to the current working directory. Otherwise it is resolved
        relative to this module's directory, so the default file is found
        no matter where the process was started.
        """
        # Imported here so the static helpers stay usable without PyYAML.
        import yaml

        yaml_path = Path(functionality_yaml)
        if not yaml_path.is_absolute() and not yaml_path.exists():
            yaml_path = Path(__file__).resolve().parent / yaml_path

        with open(yaml_path, "r", encoding="utf-8") as yaml_file:
            # safe_load is sufficient: the file only holds plain lists/maps.
            data = yaml.safe_load(yaml_file)

        # An engine that is missing, or that declares `functionalities:`
        # with no value, supports nothing.
        declared = {
            engine["name"]: engine.get("functionalities") or []
            for engine in data["database_engines"]
        }
        self.engine_functionalities = [
            Functionality.to_bitmap(declared.get(name, []))
            for name in self._ENGINE_ORDER
        ]

    @staticmethod
    def to_bitmap(functionalities: List[str]) -> int:
        """
        Returns the bitwise OR of the bit assigned to each functionality
        name. An empty list yields 0.

        Raises KeyError for a name missing from `FunctionalityBitmapValues`.
        """
        return reduce(
            operator.or_,
            (FunctionalityBitmapValues[f] for f in functionalities),
            0,
        )

    def get_engine_functionalities(self) -> List[int]:
        """
        Return a bitmap for each engine that states what functionalities the
        engine supports
        """
        return self.engine_functionalities


# Bit assigned to each functionality name.
FunctionalityBitmapValues: Dict[str, int] = {}
FunctionalityBitmapValues[Functionality.Geospatial] = 0b1
EngineBitmapValues from brad.query_rep import QueryRep class Router: + + def __init__(self): + self.functionality_catalog = Functionality() + async def run_setup(self, estimator: Optional[Estimator] = None) -> None: """ Should be called before using the router. This is used to set up any @@ -37,6 +42,47 @@ def engine_for_sync(self, query: QueryRep) -> Engine: """ raise NotImplementedError + def _run_functionality_routing( + self, query: QueryRep + ) -> Tuple[int, Optional[Engine]]: + """ + Based on the functinalities required by the query (e.g. geospatial), + compute the set of engines that are able to serve this query. + """ + + # Bitmap describing what functionality is required for running query + req_bitmap = query.get_required_functionality() + + # Bitmap for each engine which features it supports + engine_support = self.functionality_catalog.get_engine_functionalities() + engines = [Engine.Athena, Engine.Aurora, Engine.Redshift] + + # Narrow down the valid engines that can run the query, based on the + # engine functionality + valid_locations_list = [] + for engine, sup_bitmap in zip(engines, engine_support): + + query_supported = (~req_bitmap | (req_bitmap & sup_bitmap)) == -1 + + if query_supported: + valid_locations_list.append(engine) + + valid_locations = Engine.to_bitmap(valid_locations_list) + + if (valid_locations & (valid_locations - 1)) == 0: + # Bitmap trick - only one bit is set. + if (EngineBitmapValues[Engine.Aurora] & valid_locations) != 0: + return (valid_locations, Engine.Aurora) + elif (EngineBitmapValues[Engine.Redshift] & valid_locations) != 0: + return (valid_locations, Engine.Redshift) + elif (EngineBitmapValues[Engine.Athena] & valid_locations) != 0: + return (valid_locations, Engine.Athena) + else: + raise RuntimeError("Unsupported bitmap value " + str(valid_locations)) + + # There is more than one possible location. 
+ return (valid_locations, None) + def _run_location_routing( self, query: QueryRep, location_bitmap: Dict[str, int] ) -> Tuple[int, Optional[Engine]]: diff --git a/src/brad/specialized_functionality/README.md b/src/brad/specialized_functionality/README.md new file mode 100644 index 00000000..e6261408 --- /dev/null +++ b/src/brad/specialized_functionality/README.md @@ -0,0 +1,9 @@ +# Specialized functionality + +This directory contains helper functions that help BRAD run queries with specialized functionality (like geospatial queries). + +## Geospatial queries + +In `QueryRep`, we determine whether a query makes use of geospatial functions by determining whether PostGIS keywords appear in the query. `geospatial_keywords.yml` contains a list of the PostGIS keywords that BRAD considers. `geospatial_keywords.yml` can be updated by running `python geospatial_keywords.py`, which crawls a list of PostGIS keywords from [PostGIS' specialized functions index](https://postgis.net/docs/manual-1.5/ch08.html). + +Crawling the PostGIS documentation requires `requests` and `bs4`. 
"""Crawl the PostGIS function index and write the geospatial keyword list."""

import re
import sys
from pathlib import Path
from typing import Iterable, List

# URL of PostGIS special functions index
POSTGIS_INDEX_URL = "https://postgis.net/docs/manual-1.5/ch08.html"

# The keyword file lives next to this script, independent of the directory
# the script is launched from.
_OUTPUT_PATH = Path(__file__).resolve().parent / "geospatial_keywords.yml"

# A list item looks like "<keyword> - <description>".
_KEYWORD_PATTERN = re.compile(r"^(.*?)\s-\s")
_IDENTIFIER_PATTERN = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")


def extract_keywords(item_texts: Iterable[str]) -> List[str]:
    """
    Return the function names found in the given list-item texts.

    Names are returned in first-seen order without duplicates (the index
    lists many functions under several headings). Entries that are not
    plain identifiers, such as the operators `=` and `&&`, are dropped:
    they occur in ordinary SQL and would mark almost any query as
    geospatial.
    """
    seen = set()
    keywords = []
    for text in item_texts:
        match = _KEYWORD_PATTERN.search(text)
        if match is None:
            continue
        keyword = match.group(1).strip()
        if keyword in seen or _IDENTIFIER_PATTERN.fullmatch(keyword) is None:
            continue
        seen.add(keyword)
        keywords.append(keyword)
    return keywords


def main() -> int:
    """
    Download the PostGIS index and write the keyword YAML file.

    Returns 0 on success and 1 when the page could not be retrieved.
    """
    # Imported here because these packages are only needed for crawling.
    import requests
    import yaml
    from bs4 import BeautifulSoup

    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(POSTGIS_INDEX_URL, timeout=30)
    if response.status_code != 200:
        print(
            f"Failed to retrieve the webpage. Status code: {response.status_code}",
            file=sys.stderr,
        )
        return 1

    # Parse the HTML content of the page and scan every list item.
    soup = BeautifulSoup(response.text, "html.parser")
    keywords = extract_keywords(
        item.get_text() for item in soup.find_all("li")
    )

    # Write the extracted keywords to a YAML file
    with open(_OUTPUT_PATH, "w", encoding="utf-8") as yaml_file:
        yaml.dump(keywords, yaml_file, default_flow_style=False)

    print(f"Extracted {len(keywords)} keywords and saved to {_OUTPUT_PATH}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Status code: {response.status_code}") diff --git a/src/brad/specialized_functionality/geospatial_keywords.yml b/src/brad/specialized_functionality/geospatial_keywords.yml new file mode 100644 index 00000000..b341a86c --- /dev/null +++ b/src/brad/specialized_functionality/geospatial_keywords.yml @@ -0,0 +1,306 @@ +- ST_Accum +- ST_Collect +- ST_Extent +- ST_Extent3D +- ST_MakeLine +- ST_MemUnion +- ST_Polygonize +- ST_Union +- ST_Area +- ST_AsBinary +- ST_AsText +- ST_Boundary +- ST_Buffer +- ST_Centroid +- ST_Contains +- ST_ConvexHull +- ST_CoordDim +- ST_Crosses +- ST_CurveToLine +- ST_Difference +- ST_Dimension +- ST_Disjoint +- ST_Distance +- ST_EndPoint +- ST_Envelope +- ST_Equals +- ST_ExteriorRing +- ST_GMLToSQL +- ST_GeomCollFromText +- ST_GeomFromText +- ST_GeomFromWKB +- ST_GeometryFromText +- ST_GeometryN +- ST_GeometryType +- ST_InteriorRingN +- ST_Intersection +- ST_Intersects +- ST_IsClosed +- ST_IsEmpty +- ST_IsRing +- ST_IsSimple +- ST_IsValid +- ST_Length +- ST_LineFromText +- ST_LineFromWKB +- ST_LinestringFromWKB +- ST_M +- ST_MLineFromText +- ST_MPointFromText +- ST_MPolyFromText +- ST_NumGeometries +- ST_NumInteriorRing +- ST_NumInteriorRings +- ST_NumPoints +- ST_OrderingEquals +- ST_Overlaps +- ST_Perimeter +- ST_Point +- ST_PointFromText +- ST_PointFromWKB +- ST_PointN +- ST_PointOnSurface +- ST_Polygon +- ST_PolygonFromText +- ST_Relate +- ST_SRID +- ST_StartPoint +- ST_SymDifference +- ST_Touches +- ST_Transform +- ST_Union +- ST_WKBToSQL +- ST_WKTToSQL +- ST_Within +- ST_X +- ST_Y +- ST_Z +- ST_Area +- ST_AsBinary +- ST_AsGML +- ST_AsGeoJSON +- ST_AsKML +- ST_AsSVG +- ST_AsText +- ST_Buffer +- ST_CoveredBy +- ST_Covers +- ST_DWithin +- ST_Distance +- ST_GeogFromText +- ST_GeogFromWKB +- ST_GeographyFromText +- '=' +- '&&' +- ST_Intersection +- ST_Intersects +- ST_Length +- ST_Dump +- ST_DumpPoints +- ST_DumpRings +- Box2D +- Box3D +- ST_Estimated_Extent +- ST_Expand +- ST_Extent +- ST_Extent3D +- ST_MakeBox2D +- ST_MakeBox3D +- ST_XMax +- 
ST_XMin +- ST_YMax +- ST_YMin +- ST_ZMax +- ST_ZMin +- AddGeometryColumn +- Box3D +- DropGeometryColumn +- ST_Accum +- ST_AddMeasure +- ST_AddPoint +- ST_Affine +- ST_AsEWKB +- ST_AsEWKT +- ST_AsGML +- ST_AsGeoJSON +- ST_AsHEXEWKB +- ST_AsKML +- ST_Boundary +- ST_Collect +- ST_ConvexHull +- ST_CoordDim +- ST_CurveToLine +- ST_Difference +- ST_Dump +- ST_DumpPoints +- ST_DumpRings +- ST_EndPoint +- ST_Extent3D +- ST_ExteriorRing +- ST_ForceRHR +- ST_Force_3D +- ST_Force_3DZ +- ST_Force_4D +- ST_Force_Collection +- ST_GeomFromEWKB +- ST_GeomFromEWKT +- ST_GeomFromGML +- ST_GeomFromKML +- ST_GeometryN +- ST_HasArc +- ST_InteriorRingN +- ST_IsClosed +- ST_IsSimple +- ST_Length3D +- ST_Length3D_Spheroid +- ST_Length_Spheroid +- ST_LineFromMultiPoint +- ST_LineToCurve +- ST_Line_Interpolate_Point +- ST_Line_Substring +- ST_LocateBetweenElevations +- ST_M +- ST_MakeBox3D +- ST_MakeLine +- ST_MakePoint +- ST_MakePolygon +- ST_MemUnion +- ST_Mem_Size +- ST_NDims +- ST_NPoints +- ST_NRings +- ST_Perimeter3D +- ST_PointFromWKB +- ST_PointN +- ST_PointOnSurface +- ST_Polygon +- ST_RemovePoint +- ST_Rotate +- ST_RotateX +- ST_RotateY +- ST_RotateZ +- ST_Scale +- ST_SetPoint +- ST_Shift_Longitude +- ST_SnapToGrid +- ST_StartPoint +- ST_Summary +- ST_SymDifference +- ST_TransScale +- ST_Translate +- ST_X +- ST_XMax +- ST_XMin +- ST_Y +- ST_YMax +- ST_YMin +- ST_Z +- ST_ZMax +- ST_ZMin +- ST_Zmflag +- UpdateGeometrySRID +- AddGeometryColumn +- Box2D +- Box3D +- DropGeometryColumn +- GeometryType +- PostGIS_AddBBox +- PostGIS_DropBBox +- PostGIS_HasBBox +- ST_Accum +- ST_Affine +- ST_AsBinary +- ST_AsEWKB +- ST_AsEWKT +- ST_AsHEXEWKB +- ST_AsText +- ST_Collect +- ST_CoordDim +- ST_CurveToLine +- ST_Dump +- ST_Estimated_Extent +- ST_Extent3D +- ST_Force_2D +- ST_Force_3D +- ST_Force_3DM +- ST_Force_3DZ +- ST_Force_4D +- ST_Force_Collection +- ST_GeoHash +- ST_GeogFromWKB +- ST_GeomFromEWKB +- ST_GeomFromEWKT +- ST_GeomFromText +- ST_GeomFromWKB +- ST_GeometryN +- '=' +- '&<|' +- 
'&&' +- ST_HasArc +- ST_IsClosed +- ST_IsEmpty +- ST_LineToCurve +- ST_Mem_Size +- ST_NPoints +- ST_NRings +- ST_PointFromWKB +- ST_PointN +- ST_Rotate +- ST_RotateZ +- ST_SRID +- ST_Scale +- ST_SetSRID +- ST_TransScale +- ST_Transform +- ST_Translate +- ST_XMax +- ST_XMin +- ST_YMax +- ST_YMin +- ST_ZMax +- ST_ZMin +- ST_Zmflag +- UpdateGeometrySRID +- PostGIS_LibXML_Version +- ST_AddMeasure +- ST_AsBinary +- ST_AsGeoJSON +- ST_AsText +- ST_Buffer +- ST_ClosestPoint +- ST_CollectionExtract +- ST_Covers +- ST_DFullyWithin +- ST_DWithin +- ST_Distance +- ST_Distance_Sphere +- ST_Distance_Spheroid +- ST_DumpPoints +- ST_Envelope +- ST_GMLToSQL +- ST_GeomFromGML +- ST_GeomFromKML +- '&&' +- ~= +- ST_HausdorffDistance +- ST_Intersection +- ST_Intersects +- ST_Length +- ST_LongestLine +- ST_MakeEnvelope +- ST_MaxDistance +- ST_ShortestLine +- Populate_Geometry_Columns +- ST_AsSVG +- ST_Collect +- ST_ContainsProperly +- ST_Extent +- ST_GeoHash +- ST_IsValidReason +- ST_LineCrossingDirection +- ST_LocateBetweenElevations +- ST_MakeLine +- ST_MinimumBoundingCircle +- ST_Union +- ST_AsGeoJSON +- ST_SimplifyPreserveTopology