Skip to content

Commit

Permalink
Functionality catalog, initial
Browse files Browse the repository at this point in the history
  • Loading branch information
ferdiko committed Sep 5, 2023
1 parent 310780b commit a08aa6f
Show file tree
Hide file tree
Showing 7 changed files with 496 additions and 0 deletions.
24 changes: 24 additions & 0 deletions src/brad/query_rep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import sqlglot
import sqlglot.expressions as exp
import yaml
import pathlib
import os
from brad.routing.functionality_catalog import Functionality

from typing import List, Optional

Expand All @@ -17,6 +21,12 @@
"TRUNCATE",
]

# Keywords whose presence in a query's text marks the query as geospatial.
# The list is read once, when this module is imported, and every keyword is
# upper-cased at load time.
_GEOSPATIAL_KEYWORDS_PATH = os.path.join(
    pathlib.Path(__file__).parent.resolve(),
    "specialized_functionality/geospatial_keywords.yml",
)
with open(_GEOSPATIAL_KEYWORDS_PATH, "r") as f:
    _GEOSPATIAL_KEYWORDS = [k.upper() for k in yaml.safe_load(f)]


class QueryRep:
"""
Expand Down Expand Up @@ -53,6 +63,20 @@ def is_transaction_end(self) -> bool:
raw_sql = self._raw_sql_query.upper()
return raw_sql == "COMMIT" or raw_sql == "ROLLBACK"

def is_geospatial(self) -> bool:
    """
    Return True if any geospatial keyword occurs in the raw SQL text.

    The check is a case-insensitive substring match: the query is
    upper-cased and searched for each entry of `_GEOSPATIAL_KEYWORDS`.
    """
    upper_sql = self._raw_sql_query.upper()
    return any(kw in upper_sql for kw in _GEOSPATIAL_KEYWORDS)

def get_required_functionality(self) -> int:
    """
    Return a bitmap of the specialized functionality this query needs.

    Geospatial is the only functionality checked here; the bitmap is
    built by `Functionality.to_bitmap`.
    """
    needed = [Functionality.Geospatial] if self.is_geospatial() else []
    return Functionality.to_bitmap(needed)

def tables(self) -> List[str]:
if self._tables is None:
if self._ast is None:
Expand Down
11 changes: 11 additions & 0 deletions src/brad/routing/engine_functionality.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
database_engines:
- name: Aurora
functionalities:
- geospatial

- name: Athena
functionalities:
- geospatial

- name: Redshift
functionalities: []
55 changes: 55 additions & 0 deletions src/brad/routing/functionality_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from brad.config.engine import Engine
from typing import List
import operator
import yaml
from functools import reduce


class Functionality:
    """
    Catalog of the specialized functionality supported by each engine.

    The catalog is read from a YAML file with a top-level
    `database_engines` list; each entry has a `name` (Aurora, Athena or
    Redshift) and a list of `functionalities` names. Each engine's list is
    converted into an integer bitmap using `FunctionalityBitmapValues`.
    """

    # Name of the geospatial functionality, as written in the YAML file.
    Geospatial = "geospatial"

    def __init__(self, functionality_yaml="engine_functionality.yml"):
        """
        Load the per-engine functionality bitmaps.

        Args:
            functionality_yaml: Path to the catalog file. A relative path
                that does not exist relative to the current working
                directory is looked up next to this module instead, so the
                default works regardless of where the process was started.

        Raises:
            OSError: If the file cannot be opened.
            KeyError: If the file lacks `database_engines`, an entry lacks
                `name`, or a functionality name is unknown.
        """
        # Imported locally so this block does not depend on a file-level
        # import being present.
        import os

        if not os.path.isabs(functionality_yaml) and not os.path.exists(
            functionality_yaml
        ):
            module_dir = os.path.dirname(os.path.abspath(__file__))
            functionality_yaml = os.path.join(module_dir, functionality_yaml)

        # The catalog is plain data, so the safe loader is sufficient.
        with open(functionality_yaml, "r") as yaml_file:
            data = yaml.safe_load(yaml_file)

        # Engines that are absent from the file support nothing.
        per_engine = {"Athena": [], "Aurora": [], "Redshift": []}
        for engine in data["database_engines"]:
            name = engine["name"]
            if name in per_engine:
                # A missing or empty `functionalities:` key means "none".
                per_engine[name] = engine.get("functionalities") or []

        # Bitmaps are stored in the fixed order Athena, Aurora, Redshift.
        self.engine_functionalities = [
            Functionality.to_bitmap(per_engine[name])
            for name in ("Athena", "Aurora", "Redshift")
        ]

    @staticmethod
    def to_bitmap(functionalities: List[str]) -> int:
        """
        Combine functionality names into a single bitmap.

        Args:
            functionalities: Functionality names (e.g.
                `Functionality.Geospatial`).

        Returns:
            The bitwise OR of each name's value in
            `FunctionalityBitmapValues`; 0 for an empty list.

        Raises:
            KeyError: If a name has no entry in `FunctionalityBitmapValues`.
        """
        if not functionalities:
            return 0
        return reduce(
            operator.or_,
            (FunctionalityBitmapValues[f] for f in functionalities),
            0,
        )

    def get_engine_functionalities(self) -> List[int]:
        """
        Return a bitmap for each engine that states what functionalities the
        engine supports, in the order Athena, Aurora, Redshift.
        """
        return self.engine_functionalities


# Bit assigned to each functionality name; every functionality must use a
# distinct bit so bitmaps can be combined with bitwise OR.
FunctionalityBitmapValues = {}
FunctionalityBitmapValues[Functionality.Geospatial] = 0b1
46 changes: 46 additions & 0 deletions src/brad/routing/router.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from typing import Dict, Tuple, Optional

from functionality_catalog import Functionality
from brad.data_stats.estimator import Estimator
from brad.config.engine import Engine, EngineBitmapValues
from brad.query_rep import QueryRep


class Router:

def __init__(self) -> None:
    """
    Create the router with a functionality catalog built from the
    catalog's default YAML file.
    """
    # NOTE(review): this file imports Functionality with a bare
    # `from functionality_catalog import ...`, while query_rep.py uses
    # `from brad.routing.functionality_catalog import ...` — confirm the
    # bare form resolves when brad is imported as a package.
    self.functionality_catalog = Functionality()

async def run_setup(self, estimator: Optional[Estimator] = None) -> None:
"""
Should be called before using the router. This is used to set up any
Expand Down Expand Up @@ -37,6 +42,47 @@ def engine_for_sync(self, query: QueryRep) -> Engine:
"""
raise NotImplementedError

def _run_functionality_routing(
    self, query: QueryRep
) -> Tuple[int, Optional[Engine]]:
    """
    Based on the functionalities required by the query (e.g. geospatial),
    compute the set of engines that are able to serve this query.

    Returns a pair `(valid_locations, engine)`. `valid_locations` is the
    engine bitmap of every engine whose supported functionality covers
    what the query requires. `engine` is the single valid engine when
    exactly one bit is set, and `None` when more than one engine is valid.

    Raises `RuntimeError` when the bitmap has at most one bit set but
    matches none of Aurora, Redshift or Athena (this includes the case
    where no engine is valid and the bitmap is 0).
    """
    # Functionality bits the query needs.
    required = query.get_required_functionality()

    # Functionality bits each engine offers. The engine tuple below is
    # paired positionally with these bitmaps.
    supported_per_engine = self.functionality_catalog.get_engine_functionalities()
    candidates = (Engine.Athena, Engine.Aurora, Engine.Redshift)

    # An engine qualifies when every required bit is also a supported bit.
    capable = [
        engine
        for engine, supported in zip(candidates, supported_per_engine)
        if (required & supported) == required
    ]
    valid_locations = Engine.to_bitmap(capable)

    # Bitmap trick: x & (x - 1) clears the lowest set bit, so a non-zero
    # result means at least two bits are set.
    if (valid_locations & (valid_locations - 1)) != 0:
        # There is more than one possible location.
        return (valid_locations, None)

    for engine in (Engine.Aurora, Engine.Redshift, Engine.Athena):
        if (EngineBitmapValues[engine] & valid_locations) != 0:
            return (valid_locations, engine)

    raise RuntimeError("Unsupported bitmap value " + str(valid_locations))

def _run_location_routing(
self, query: QueryRep, location_bitmap: Dict[str, int]
) -> Tuple[int, Optional[Engine]]:
Expand Down
9 changes: 9 additions & 0 deletions src/brad/specialized_functionality/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Specialized functionality

This directory contains helpers that let BRAD run queries that need specialized functionality (like geospatial queries).

## Geospatial queries

In `QueryRep`, we detect whether a query uses geospatial functions by checking whether any PostGIS keyword appears in the query text. `geospatial_keywords.yml` contains the list of PostGIS keywords that BRAD considers. It can be regenerated by running `python geospatial_keywords.py`, which crawls the keywords from [PostGIS' specialized functions index](https://postgis.net/docs/manual-1.5/ch08.html).

Crawling the PostGIS documentation requires `requests` and `bs4`.
45 changes: 45 additions & 0 deletions src/brad/specialized_functionality/geospatial_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import requests

This comment has been minimized.

Copy link
@geoffxy

geoffxy Sep 22, 2023

Member

Move into tools/

from bs4 import BeautifulSoup
import re
import yaml

if __name__ == "__main__":
    # Crawl the PostGIS special functions index and store the function and
    # operator names it lists as a YAML list of keywords.

    # Imported locally: only this entry point needs it.
    import pathlib

    # URL of PostGIS special functions index
    url = "https://postgis.net/docs/manual-1.5/ch08.html"

    # Bound the request so a stalled connection cannot hang the script.
    response = requests.get(url, timeout=30)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Index entries read "<keyword> - <description>"; capture the text
        # before the first " - ".
        keyword_pattern = re.compile(r"^(.*?)\s-\s")

        keywords = []
        for item in soup.find_all("li"):
            match = keyword_pattern.search(item.get_text())
            if not match:
                continue
            keyword = match.group(1).strip()
            # Skip empty captures: an empty keyword is a substring of every
            # query and would mark every query as geospatial.
            if keyword:
                keywords.append(keyword)

        # Drop repeated entries while keeping their first-seen order.
        keywords = list(dict.fromkeys(keywords))

        # Write next to this script, under the file name that query_rep.py
        # loads, instead of into the current working directory.
        output_yaml_file = (
            pathlib.Path(__file__).parent.resolve() / "geospatial_keywords.yml"
        )

        # Write the extracted keywords to a YAML file
        with open(output_yaml_file, "w") as yaml_file:
            yaml.dump(keywords, yaml_file, default_flow_style=False)

        print(f"Extracted {len(keywords)} keywords and saved to {output_yaml_file}")
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
Loading

0 comments on commit a08aa6f

Please sign in to comment.