diff --git a/.docker/docker-compose.prod.yml b/.docker/docker-compose.prod.yml new file mode 100644 index 00000000..41202524 --- /dev/null +++ b/.docker/docker-compose.prod.yml @@ -0,0 +1,33 @@ +version: '3' +services: + congress_parser_api: + environment: + - STAGE=prod + - db_host=10.0.0.248:5432 + congress_viewer_app: + volumes: + - /var/www/congress:/usr/src/app/build + entrypoint: + - "yarn" + command: + - "build" +# congress_nginx: +# container_name: congress_nginx +# build: +# context: ../ +# dockerfile: .docker/nginx.dockerfile +# ports: +# - 80:80 +# - 443:443 +# networks: +# parser: +# volumes: +# - /etc/letsencrypt:/etc/letsencrypt +# - ../frontend/.nginx/:/etc/nginx/sites-enabled/ +# - /var/www/congress:/var/www/congress + congress_postgres: + image: tianon/true +networks: + parser: + external: + name: docker_parser diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index 76030b91..ab98e13b 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -1,6 +1,6 @@ version: '3' services: - parser_api: + congress_parser_api: container_name: congress_parser_api tty: true stdin_open: true @@ -22,13 +22,18 @@ services: - congress_postgres networks: parser: - viewer_app: + entrypoint: "python3" + command: + - "-m" + - "billparser" + congress_viewer_app: container_name: congress_viewer_app tty: true stdin_open: true environment: - STAGE=dev - CHOKIDAR_USEPOLLING=true + - API_URL=http://localhost:9090/ build: context: ../frontend dockerfile: .docker/Dockerfile @@ -36,19 +41,11 @@ services: - ../frontend/public:/usr/src/app/public - ../frontend/src:/usr/src/app/src ports: - - "80:3000" + - "3000:3000" networks: parser: - # nginx: - # image: nginx - # ports: - # - 80:80 - # - 443:443 - # networks: - # parser: - # volumes: - # - /etc/letsencrypt:/etc/letsencrypt - # - ./nginx/:/etc/nginx/sites-enabled/ + command: + - "start.sh" congress_postgres: image: postgres:10.6 container_name: congress_postgres @@ -67,4 +64,4 @@ networks: external: name: docker_parser volumes: - postgres-volume: \ No newline at end of file + postgres-volume: diff --git a/.docker/nginx.dockerfile b/.docker/nginx.dockerfile new file mode 100644 index 00000000..331eb590 --- /dev/null +++ b/.docker/nginx.dockerfile @@ -0,0 +1,11 @@ +FROM debian:jessie + +RUN apt-get update && apt-get install -y nginx \ + ca-certificates \ + gettext-base +RUN ln -sf /dev/stdout /var/log/nginx/access.log \ + && ln -sf /dev/stderr /var/log/nginx/error.log + +EXPOSE 80 443 + +CMD ["/usr/sbin/nginx", "-g", "daemon off;"] diff --git a/.gitignore b/.gitignore index 8ae0f18b..c59c7c5d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ **.pyc -**/node_modules \ No newline at end of file +**/node_modules +frontend/yarn-error.log +.vscode +.docker/docker-compose.local.yml diff --git a/backend/.docker/Dockerfile b/backend/.docker/Dockerfile index 4e719912..79995639 100644 --- a/backend/.docker/Dockerfile +++ b/backend/.docker/Dockerfile @@ -12,5 +12,4 @@ WORKDIR /usr/src/app EXPOSE 9090 EXPOSE 80 -ENTRYPOINT "python" -CMD ["-m", "billparser"] \ No newline at end of file +ENTRYPOINT ["python3", "-m", "billparser"] \ No newline at end of file diff --git a/backend/.dockerignore b/backend/.dockerignore index 80305eed..72ca9d0b 100644 --- a/backend/.dockerignore +++ b/backend/.dockerignore @@ -1,3 +1,4 @@ .git .docker *.workspace +usc \ No newline at end of file diff --git a/backend/billparser/__main__.py b/backend/billparser/__main__.py index 84b4a31a..c77bf0a1 100644 --- a/backend/billparser/__main__.py +++ b/backend/billparser/__main__.py @@ -15,6 +15,7 @@ from flask_cors import CORS from flask_sqlalchemy import SQLAlchemy from flask_sqlalchemy_session import flask_scoped_session +from flask_sqlalchemy_session import current_session from sqlalchemy.orm import sessionmaker from billparser.db.handler import DATABASE_URI @@ -31,7 +32,17 @@ get_sections, get_versions, get_revisions, - get_revision_diff + get_revision_diff, +) +from billparser.db.models import ( + Chapter, + Section, + Content, + ContentDiff, + Version, + Bill, + BillVersion, + BillContent, ) from billparser.helpers import treeify @@ -78,8 +89,21 @@ row2dict = lambda r: {c.name: str(getattr(r, c.name)) for c in r.__table__.columns} -@app.route("/bills") -def bills(): +@app.route("/bills", methods=["GET"]) +def bills() -> str: + """ + This is the bill search function. There are a few query parameters: + Params: + h (int): Boolean of if they requested House bills + s (int): Boolean of if they requested Senate bills + q (str): Text to search for + + These are kinda dumb. + incl (str): Mutually exclusive search for specific bill version types (include) + decl (str): Mutually exclusive search for specific bill version types (exclude) + Returns: + str: Array of objects representing the bills that match the search + """ house = request.args.get("h", default=1, type=int) senate = request.args.get("s", default=1, type=int) query = request.args.get("q", default="", type=str) @@ -109,57 +133,94 @@ def bills(): return res -@app.route("/bill/") -def bill_content(bill_version): +@app.route("/bill/", methods=["GET"]) +def bill_content(bill_version: str) -> str: + """ + Returns the bill text, broken down by the way the XML was structured + + Args: + bill_version (str): bill_version_id used as a fk on the BillContent table + + Returns: + str: String json array of bills + """ results = get_bill_contents(bill_version) results = [x.to_dict() for x in results] - # print(treeify(results)) - ret_obj = {"contents": results} return json.dumps(results) -@app.route("/bill_tree/") -def bill_content_tree(bill_version): +@app.route("/bill_tree/", methods=["GET"]) +def bill_content_tree(bill_version: str) -> str: + """ + Handles assembling the bill tree for a particular bill + + Args: + bill_version (str): bill_version_id used as a fk on the BillContent table + + Returns: + str: A treeified version of the bill, and the associated metadata + """ results = get_bill_contents(bill_version) results = [x.to_dict() for x in results] metadata = get_bill_metadata(bill_version) res = treeify(results)["child"][0] - # ['"`print(res) return json.dumps({"content": res, "metadata": metadata}) -@app.route("/titles") -def titles(): - res = [] - for chapter in get_chapters(): - res.append(chapter.to_dict()) +@app.route("/titles", methods=["GET"]) +def titles() -> str: + """ + Returns all the chapters of the USCode + + Returns: + str: str array of the chapter objects + """ + res = [chapter.to_dict() for chapter in get_chapters()] return json.dumps(res) -@app.route("/versions") -def versions(): +@app.route("/versions", methods=["GET"]) +def versions() -> str: + """ + More of a debug function that lists all the Version rows + These represent the different USCode release points, and the bills themselves + + Returns: + str: Dump of the Version table + """ res = [] for version in get_versions(): res.append(version.to_dict()) return json.dumps(res) -@app.route("/revisions") -def revisions(): +@app.route("/revisions", methods=["GET"]) +def revisions() -> str: + """ + Returns a dump of the USCode release points available in the database. + These are the XML dumps that are put out when an enrolled bill is codified. + + Returns: + str: Dump of the Version table where base_id == None + """ res = [] for version in get_revisions(): res.append(version.to_dict()) return json.dumps(res) -@app.route("/test") -def test(): - return get_revision_diff(1,2) @app.route("/version", methods=["POST"]) -def version(): - print(request) +def version() -> str: + """ + Grabs the diff for a specific bill_version_id. + ContentDiff is a preprocessed table with all the diffs, this merely returns them. + + Currently the entire set is returned. + + Returns: + str: Object with the diffs and the contents enumerated for a specific bill version + """ req = request.json - print(req) res = {"diffs": [], "contents": []} if "version" in req: for diff in get_diffs(int(req["version"])): @@ -170,7 +231,23 @@ def version(): @app.route("/latest/chapter/", methods=["GET"]) -def latest_sections(chapter_number): +def latest_sections(chapter_number: str) -> str: + """ + Returns the sections for a given chapter in the latest version of the USCode + + # TODO: Paginate + Currently not paginated, might also be useless to return them all, as a user likely + wants a specific one, and it's sort of unintelligble to look at them all at once. + + + Args: + chapter_number (str): Chapter "Number" which is actually not a number, + It's more of the Chapter's official "number", all pulled from the uscode.house.gov + 05, 11, 18, 28, 50 all have *A varients. + + Returns: + str: Stringified array of the rows + """ res = [] for section in get_latest_sections(chapter_number): res.append(section.to_dict()) @@ -178,7 +255,18 @@ def latest_sections(chapter_number): @app.route("/chapter/", methods=["GET"]) -def sections(chapter_id): +def sections(chapter_id: int) -> str: + """ + Gets the sections for a specific chapter id. + + # TODO: Unused function? + + Args: + chapter_id (int): PK on the Chapter table + + Returns: + str: Stringifed array of the rows + """ latest_base = ( current_session.query(Version).filter(Version.base_id == None).all()[0] ) @@ -191,7 +279,21 @@ def sections(chapter_id): @app.route( "/latest/section//", methods=["GET"] ) -def latest_contents(chapter_number, section_number): +def latest_contents(chapter_number: str, section_number: str) -> str: + """ + Grabs the content for a given section inside a given chapter. + + # TODO: Create a typevar for chapter_number + + Args: + chapter_number (str): Chapter "Number" which is actually not a number, + It's more of the Chapter's official "number", all pulled from the uscode.house.gov + 05, 11, 18, 28, 50 all have *A varients. + section_number (str): Section "Number" basically the same as above, they are not really numbers + + Returns: + str: Stringifed array of the rows + """ res = [] for section in get_latest_content(chapter_number, section_number): res.append(section.to_dict()) @@ -199,7 +301,16 @@ def latest_contents(chapter_number, section_number): @app.route("/section/", methods=["GET"]) -def contents(section_id): +def contents(section_id: int) -> str: + """ + Returns the data for a given section_id + + Args: + section_id (int): PK on the Section table + + Returns: + str: Stringified array of the content rows + """ latest_base = ( current_session.query(Version).filter(Version.base_id == None).all()[0] ) @@ -211,6 +322,9 @@ def contents(section_id): @app.app.after_request def add_header(response): + """ + This was something I was doing for logging requests for a goaccess endpoint which isn't used anymore + """ if not windows: if "X-Real-Ip" in request.headers: remote_addr = request.headers.getlist("X-Real-Ip")[0].strip() diff --git a/backend/billparser/actions/__init__.py b/backend/billparser/actions/__init__.py index 81f34b58..fe51271f 100644 --- a/backend/billparser/actions/__init__.py +++ b/backend/billparser/actions/__init__.py @@ -1,7 +1,14 @@ import re from billparser.logger import log +from unidecode import unidecode + +# TODO: This whole file is some honkin bullshit. It's entirely unsustainable, but at the same time, unless I can get them to follow standards, I'm not sure +# I can actually do anything else but maintain a long ass list of regexes. + +# These regexes all have named capture groups, because they are incredibly useful +# The capture groups are typically consistent, especially when the same functions are used between different 'actions' regex_holder = { "SHORT-TITLE": [ r"This (?P(?:act|(?:sub)?title)) may be cited as the (?P.+?)\." @@ -79,13 +86,28 @@ "TERM-DEFINITION": [r"The term \"(?P<term>.+?)\" means (?P<term_def>.+?)."], } -from unidecode import unidecode +SuchCodeRegex = re.compile(r"(Section|paragraph) (?P<section>\d*)\(", re.IGNORECASE) +SubParts = re.compile(r"\((.*?)\)") +DupeFinder = re.compile(r"(\/.{1,}\b)\1") + for action in regex_holder: regex_holder[action] = [re.compile(x, flags=re.I) for x in regex_holder[action]] -def determine_action(text): +def determine_action(text: str) -> dict: + """ + Parses the input string against all the regexes + Searches each action's regexes until it finds one + The order in which the regexes are placed are important, because the most general ones need to be last + Especially if there is information in the more specific ones that is important for action. + + Args: + text (str): Input bill clause string + + Returns: + dict: A dict of the matching action regexes + """ actions = {} text = unidecode(text).replace("--", "-") for action in regex_holder: @@ -101,12 +123,18 @@ def determine_action(text): return actions -SuchCodeRegex = re.compile(r"(Section|paragraph) (?P<section>\d*)\(", re.IGNORECASE) -SubParts = re.compile(r"\((.*?)\)") -DupeFinder = re.compile(r"(\/.{1,}\b)\1") +def parse_such_code(text: str, title: str) -> str: + """ + Sometimes clauses in a bill will reference "such code", which means we've already been given the Chapter + and all we have to do is attempt to match up the section to that Chapter + Args: + text (str): String containing the such code reference + title (str): Chapter -def parse_such_code(text, title): + Returns: + str: A usc cite according to the such code logic + """ SuchCodeRegex_match = SuchCodeRegex.search(text) if SuchCodeRegex_match: cite = "/us/usc/t{}/s{}".format(title, SuchCodeRegex_match["section"]) diff --git a/backend/billparser/actions/insert.py b/backend/billparser/actions/insert.py index 4d92fdb5..0645b76e 100644 --- a/backend/billparser/actions/insert.py +++ b/backend/billparser/actions/insert.py @@ -3,24 +3,49 @@ from billparser.db.models import ContentDiff, Section, Content +from billparser.actions import ActionObject + from billparser.actions.strike import strike_text import re from billparser.logger import log +from lxml import etree + +Element = etree.Element + def recursive_content( - chapter_id, - section_id, - content_id, - search_element, - order, - version_id, - last_ident, - session, -): - # print(' '.join(search_element.itertext()).strip().replace('\n', ' ')) - # if it has an id it is probably a thingy + chapter_id: int, + section_id: int, + content_id: int, + search_element: Element, + order: int, + version_id: int, + last_ident: str, + session: "Session", +) -> None: + """ + This is the function that "inserts" a new block of content from a bill. + In the case where a bill says "insert blah blah after section E", this will + recursively look at the content to be inserted, and insert it as a ContentDiff object + it will also make an empty Content row at the locations, together these signify that + they have been inserted. + + TODO: Fix this so we aren't having to flush between subsequent calls? We should be able to just assume the new ID, even in parallel circumstances + + + Args: + chapter_id (int): PK in the Chapter table for where this will be inserted + section_id (int): PK in the Section table for where this will be inserted + content_id (int): PK of the _parent_ Content for these to be added under + search_element (Element): The xml element from the bill + order (int): What order should it be rendered in, this is important because it would be complicated to take the section header and get the order from it + version_id (int): PK in the Version table, this is the bill's corresponding version + last_ident (str): This represents the cite location for the newly added content + session (Session): DB session to add these new objects too + """ + # if it has an id it is probably an element that we care about if "id" in search_element.attrib: enum = search_element[0] @@ -28,6 +53,8 @@ def recursive_content( ident = ident.replace("//", "/") heading = search_element[1] content_str = None + + # TODO: Remember why I had to break this out like this? if len(search_element) > 2: content_elem = search_element[2] if ( @@ -99,8 +126,14 @@ def recursive_content( order = order + 1 -def insert_section_after(action_obj, session): - action = action_obj.action +def insert_section_after(action_obj: ActionObject, session: "Session") -> None: + """ + Figures out what Chapter/Section we are in so that we can insert after it + + Args: + action_obj (ActionObject): The parsed action + session (Session): DB session to insert into + """ cited_content = action_obj.cited_content new_vers_id = action_obj.version_id if action_obj.next is not None: @@ -124,10 +157,16 @@ def insert_section_after(action_obj, session): session.commit() -def insert_end(action_obj, session): - action = action_obj.action +def insert_end(action_obj: ActionObject, session: "Session") -> None: + """ + When an amendment is "insert the following after Section 3(a)" + This figures out what the latest section is in there and then calls insert after + + Args: + action_obj (ActionObject): Parsed action + session ([type]): DB session + """ cited_content = action_obj.cited_content - new_vers_id = action_obj.version_id if action_obj.next is not None: last_content = ( session.query(Content) @@ -138,12 +177,20 @@ def insert_end(action_obj, session): ) if len(last_content) > 0: action_obj.cited_content = last_content[0] + # DRY :) insert_section_after(action_obj, session) else: log.warn("Couldn't find content") -def insert_text_end(action_obj, session): +def insert_text_end(action_obj: ActionObject, session: "Session") -> None: + """ + This is usually called when they want to add text to the end of a single clause + + Args: + action_obj (ActionObject): Parsed action + session ([type]): DB session + """ action = action_obj.action cited_content = action_obj.cited_content log.debug(cited_content.content_id) @@ -185,20 +232,34 @@ def insert_text_end(action_obj, session): log.debug("Added diff", diff.diff_id) -def insert_text_after(action_obj, session): - action = action_obj.action - cited_content = action_obj.cited_content - new_vers_id = action_obj.version_id +def insert_text_after(action_obj: ActionObject, session: "Session") -> None: + """ + This typically gets called when they want to add text after another bit of text + "add 'blah' after 'blibbilty' in Section 3(a) of USC 3" + + TODO: I think I messed up, by calling the strike_text function it may happen more than once? + + Args: + action_obj (ActionObject): Parsed action + session ([type]): DB session + """ action_obj.action["to_replace"] = ( action_obj.action["to_remove_text"] + " " + action_obj.action["to_insert_text"] ) strike_text(action_obj, session) -def insert_text_before(action_obj, session): - action = action_obj.action - cited_content = action_obj.cited_content - new_vers_id = action_obj.version_id +def insert_text_before(action_obj: ActionObject, session: "Session") -> None: + """ + This typically gets called when they want to add text before another bit of text + "add 'blah' before 'blibbilty' in Section 3(a) of USC 3" + + TODO: I think I messed up, by calling the strike_text function it may happen more than once? + + Args: + action_obj (ActionObject): Parsed action + session ([type]): DB session + """ action_obj.action["to_remove_text"] = action_obj.action["target_text"] action_obj.action["to_replace"] = ( action_obj.action["to_insert_text"] + " " + action_obj.action["target_text"] diff --git a/backend/billparser/actions/redesignate.py b/backend/billparser/actions/redesignate.py index 607b6b8e..e4a98bb9 100644 --- a/backend/billparser/actions/redesignate.py +++ b/backend/billparser/actions/redesignate.py @@ -2,12 +2,20 @@ from billparser.db.models import ContentDiff, Section, Content from billparser.logger import log import re +from billparser.actions import ActionObject name_extract = re.compile(r"\((?P<name>.+?)") -def redesignate(action_obj, session): +def redesignate(action_obj: ActionObject, session: "Session") -> None: + """ + Handles changing the display letter to something new for a section + + Args: + action_obj (ActionObject): Parsed action + session (Session): Current database session + """ action = action_obj.action new_vers_id = action_obj.version_id cited_content = action_obj.cited_content diff --git a/backend/billparser/actions/strike.py b/backend/billparser/actions/strike.py index a0516021..d9a83b11 100644 --- a/backend/billparser/actions/strike.py +++ b/backend/billparser/actions/strike.py @@ -2,9 +2,19 @@ from billparser.db.models import ContentDiff, Section, Content from billparser.logger import log import re +from billparser.actions import ActionObject -def strike_section(action_obj, session): + +def strike_section(action_obj: ActionObject, session: "Session") -> None: + """ + This handles removing an entire section it does this by making a ContentDiff with empty + strings for all the parts. + + Args: + action_obj (ActionObject): Parsed action object + session (Session): Current database session + """ action = action_obj.action new_vers_id = action_obj.version_id cited_content = action_obj.cited_content @@ -37,7 +47,18 @@ def strike_section(action_obj, session): session.commit() -def strike_emu(to_strike, to_replace, target): +def strike_emulation(to_strike: str, to_replace: str, target: str) -> str: + """ + Handles emulating the strike text behavior for a given string + + Args: + to_strike (str): Text to search for + to_replace (str): Text to replace with, if any + target (str): Text to look in + + Returns: + str: The result of the replacement + """ start_boi = r"(\s|\b)" if re.match(r"[^\w]", to_strike): start_boi = "" @@ -52,13 +73,24 @@ def strike_emu(to_strike, to_replace, target): return target -def strike_text(action_obj, session): +def strike_text(action_obj: ActionObject, session: "Session") -> None: + """ + Handle striking and replacing text from a given clause or header. + It checks to see if the text is within the header or the content, but not both + + TODO: Do both? + + TODO: Return something instead of acting directly on the session + + Args: + action_obj (ActionObject): Parsed action object + session (Session): Current DB session + """ action = action_obj.action cited_content = action_obj.cited_content new_vers_id = action_obj.version_id to_strike = action.get("to_remove_text", None) to_replace = action.get("to_replace", "") - sub = re.compile(f"\b{re.escape(to_strike)}\b") chapter = ( session.query(Section) .filter(Section.section_id == cited_content.section_id) @@ -70,7 +102,7 @@ def strike_text(action_obj, session): chapter_id = chapter[0].chapter_id if to_strike is not None: if cited_content.heading is not None and to_strike in cited_content.heading: - heading_diff = strike_emu(to_strike, to_replace, cited_content.heading) + heading_diff = strike_emulation(to_strike, to_replace, cited_content.heading) if heading_diff != cited_content.heading: diff = ContentDiff( content_id=cited_content.content_id, @@ -83,7 +115,7 @@ def strike_text(action_obj, session): cited_content.content_str is not None and to_strike in cited_content.content_str ): - content_diff = strike_emu( + content_diff = strike_emulation( to_strike, to_replace, cited_content.content_str ) if content_diff != cited_content.content_str: diff --git a/backend/billparser/compare.py b/backend/billparser/compare.py index 182f5e31..b8092258 100644 --- a/backend/billparser/compare.py +++ b/backend/billparser/compare.py @@ -5,7 +5,18 @@ cols = ["action", "file", "enum", "lxml_path", "parsed_cite", "text"] -def compare_dfs(df_1, df_2): +def compare_dfs(df_1: pd.DataFrame, df_2: pd.DataFrame) -> None: + """ + This function was for when I was making iterative progress on updating the parsing/transforming rules + I wanted a simple way to compare successive outputs for changes. + It is an inspired function. + + It does the comparisons, then writes the output to disk. + + Args: + df_1 (pd.DataFrame): DF 1 + df_2 (pd.DataFrame): DF 2 + """ df_1 = df_1[cols].copy() df_2 = df_2[cols].copy() for col in cols: diff --git a/backend/billparser/db/caching.py b/backend/billparser/db/caching.py index 370c586a..36fb34cd 100644 --- a/backend/billparser/db/caching.py +++ b/backend/billparser/db/caching.py @@ -13,6 +13,8 @@ from dogpile.cache.region import make_region from dogpile.cache.api import NO_VALUE +# This is the same dumb library we use at Groundspeed and I don't think it works +# And I was the one who added it to our stuff at Groundspeed. RIP def md5_key_mangler(key): """ diff --git a/backend/billparser/db/handler.py b/backend/billparser/db/handler.py index e112b816..e2a7cde3 100644 --- a/backend/billparser/db/handler.py +++ b/backend/billparser/db/handler.py @@ -21,6 +21,7 @@ Base.metadata.create_all(engine) Session = sessionmaker(bind=engine, query_cls=query_callable(regions)) +ribber = string.ascii_letters + string.digits def unidecode_str(input_str: str) -> str: @@ -37,10 +38,16 @@ def open_usc(file): return lookup -ribber = string.ascii_letters + string.digits +def get_number(ident: str) -> float: + """ + Converts a usc_ident into a number that is supposed to impart some implicit order + Args: + ident (str): The usc ident -def get_number(ident): + Returns: + float: The supposed order + """ ident = unidecode_str(ident) if "..." in ident: ident = ident.split("...")[0] diff --git a/backend/billparser/db/queries.py b/backend/billparser/db/queries.py index c138dcf4..2c2a6373 100644 --- a/backend/billparser/db/queries.py +++ b/backend/billparser/db/queries.py @@ -16,6 +16,8 @@ from sqlalchemy.sql import alias import re +from typing import List + import platform windows = platform.system() == "Windows" @@ -28,7 +30,13 @@ @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_chapters(): +def get_chapters(version_id=DEFAULT_VERSION_ID) -> List[Chapter]: + """ + Gets all the Chapters for the current version + + Returns: + List[Chapter]: A list of all the + """ results = ( current_session.query(Chapter) .filter(Chapter.version_id == DEFAULT_VERSION_ID) @@ -38,7 +46,20 @@ def get_chapters(): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_bills(house, senate, query, incl, decl): +def get_bills(house: int, senate: int, query: str, incl: str, decl: str) -> List[Bill]: + """ + Gets the Bill rows according to the filters. + + Args: + house (int): Include House bills + senate (int): Include Senate bills + query (str): Text to search for in the title + incl (str): Versions to include + decl (str): Versions to exclude + + Returns: + List[Bill]: Bill objects that pass the above filters + """ results = current_session.query(Bill).join(BillVersion) if house != 1: results = results.filter(Bill.chamber != "House") @@ -67,19 +88,40 @@ def get_bills(house, senate, query, incl, decl): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_versions(): +def get_versions() -> List[Version]: + """ + Gets a list of all the Version rows that correspond to Bills + + Returns: + List[Version]: List of Versions corresponding to the Bill versions + """ results = current_session.query(Version).filter(Version.base_id is not None).all() return results @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_revisions(): +def get_revisions() -> List[Version]: + """ + Gets a list of all the Version rows that correspond to USCode revisions + + Returns: + List[Version]: List of Versions that are USCode revisions + """ results = current_session.query(Version).filter(Version.base_id is None).all() return results @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_latest_sections(chapter_number): +def get_latest_sections(chapter_number: str) -> List[Section]: + """ + Gets the sections for the given Chapter, from the first USCode revision in the table + + Args: + chapter_number (str): Given Chapter.number to look for + + Returns: + List[Section]: List of Sections from the given Chapter + """ latest_base = ( current_session.query(Version).filter(Version.base_id == None).all()[0] ) @@ -98,7 +140,17 @@ def get_latest_sections(chapter_number): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_sections(chapter_id, version_id): +def get_sections(chapter_id: int, version_id: int) -> List[Section]: + """ + Gets the sections from the chapter and version id + + Args: + chapter_id (int): Chapter id to look at + version_id (int): Version id to look at + + Returns: + List[Section]: List of sections that match the criteria + """ results = ( current_session.query(Section) .filter(Section.chapter_id == chapter_id, Section.version_id == version_id) @@ -108,7 +160,18 @@ def get_sections(chapter_id, version_id): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_latest_content(chapter_number, section_number): +def get_latest_content(chapter_number: str, section_number: str) -> List[Content]: + """ + Converts a chapter number and section number into chapter and version ids + Then calls the get_content function with those arguments + + Args: + chapter_number (str): The Chapter number to search for + section_number (str): The Section number to search for + + Returns: + List[Content]: List of Contents from the given section + """ latest_base = ( current_session.query(Version).filter(Version.base_id == None).all()[0] ) @@ -129,7 +192,19 @@ def get_latest_content(chapter_number, section_number): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_content(section_id, version_id): +def get_content(section_id: int, version_id: int) -> List[Content]: + """ + Gets the contents of a given Section in a given Version + + TODO: Is version id redundent here? + + Args: + section_id (int): Section id to look at + version_id (int): Version id to look at + + Returns: + List[Content]: Content that passes the above filter + """ results = ( current_session.query(Content) .filter(Content.section_id == section_id, Content.version_id == version_id) @@ -140,22 +215,38 @@ def get_content(section_id, version_id): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_content_versions(version_id): - print(version_id, isinstance(version_id, (int))) +def get_content_versions(bill_version_id: int) -> List[Content]: + """ + Returns the content versions for a given bill version id + + Args: + bill_version_id (int): Given bill version id + + Returns: + List[Content]: List of Content that corresponds to a given Bill + """ results = ( current_session.query(Content) .filter( - Version.bill_version_id == version_id, + Version.bill_version_id == bill_version_id, Content.version_id == Version.version_id, ) .all() ) - print("Content versions", len(results)) return results @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_diffs(bill_version_id): +def get_diffs(bill_version_id: int) -> List[ContentDiff]: + """ + Gets the ContentDiffs for a given bill_version_id + + Args: + bill_version_id (int): Target bill version id + + Returns: + List[ContentDiff]: List of ContentDiffs for the bill version + """ version = ( current_session.query(Version) .filter(Version.bill_version_id == bill_version_id) @@ -170,12 +261,20 @@ def get_diffs(bill_version_id): .filter(ContentDiff.version_id == version.version_id) .all() ) - print("Diffs", len(results)) return results @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_bill_contents(bill_version_id): +def get_bill_contents(bill_version_id: int) -> List[BillContent]: + """ + Returns the BillContent for a given bill_version + + Args: + bill_version_id (int): BillVersion PK on the BillContent table + + Returns: + List[BillContent]: Matching BillContent rows + """ results = ( current_session.query(BillContent) .filter(BillContent.bill_version_id == bill_version_id) @@ -185,7 +284,7 @@ def get_bill_contents(bill_version_id): @cached(cache=TTLCache(maxsize=512, ttl=CACHE_TIME)) -def get_bill_metadata(bill_version_id): +def get_bill_metadata(bill_version_id: int) -> dict: bill_version = ( current_session.query(BillVersion) .filter(BillVersion.bill_version_id == bill_version_id) @@ -201,7 +300,7 @@ def get_bill_metadata(bill_version_id): return { "chamber": bill[0].chamber, "number": bill[0].bill_number, - "version": bill_version[0].bill_versiIon, + "version": bill_version[0].bill_version, } return {} diff --git a/backend/billparser/downloader.py b/backend/billparser/downloader.py index e415d8fc..d8123da2 100644 --- a/backend/billparser/downloader.py +++ b/backend/billparser/downloader.py @@ -2,6 +2,9 @@ def download(): + """ + Currently hardcoded for just the first session of the 116 congress + """ os.makedirs("bills/116", exist_ok=True) os.system( "wget https://www.govinfo.gov/bulkdata/BILLS/116/1/s/BILLS-116-1-s.zip --output-document bills/116/s_1.zip" diff --git a/backend/billparser/helpers.py b/backend/billparser/helpers.py index ecc24ed5..71025d59 100644 --- a/backend/billparser/helpers.py +++ b/backend/billparser/helpers.py @@ -6,11 +6,22 @@ from unidecode import unidecode -cite_regex = re.compile("^(\d+) [USC\.]+ (.*?)(\(.+\))?$", re.IGNORECASE) -section_regex = re.compile("^Section (.*?)(\(.*?\))?$", re.IGNORECASE) -part_regex = re.compile("\((.*?)\)", re.IGNORECASE) +# TODO: I think this is mostly unused + +cite_regex = re.compile(r"^(\d+) [USC\.]+ (.*?)(\(.+\))?$", re.IGNORECASE) +section_regex = re.compile(r"^Section (.*?)(\(.*?\))?$", re.IGNORECASE) +part_regex = re.compile(r"\((.*?)\)", re.IGNORECASE) # /us/usc/t7/s7333/e def convert_to_usc_id(xref): + """ + # TODO: This is complicated + + Args: + xref ([type]): [description] + + Returns: + [type]: [description] + """ try: # doc = xref.attrib['legal-doc'] cite = unidecode(xref.attrib["parsable-cite"]) @@ -45,6 +56,7 @@ def convert_to_usc_id(xref): def determine_action(xref): + # TODO: Unused function? par = xref.getparent() texts = [x for x in par.itertext() if x != xref.text] print("actions", texts[-1]) @@ -56,6 +68,7 @@ def determine_action(xref): def look_for_directions(text): + # TODO: Unused function? if "by striking" in text: overall = "striking" print("Directions", text) @@ -69,6 +82,7 @@ def look_for_directions(text): def determine_result(paragraph): + # TODO: Unused function print("dtrs") text = [x.strip() for x in paragraph.itertext() if x.strip() != ""] print(text[0]) @@ -81,6 +95,7 @@ def determine_result(paragraph): def parse_usc(title): + # TODO: Unused function? global usc_root, usc_29 with open("usc/usc{}.xml".format(title), "rb") as file: usc_root = etree.fromstring(file.read()) @@ -90,6 +105,7 @@ def parse_usc(title): def parse(file, name): + # TODO: Unused function? global usc_root, usc_29 ref_boi = [] try: @@ -125,7 +141,6 @@ def parse(file, name): for para in root.xpath("//paragraph"): print(etree.tostring(para, pretty_print=True).decode()) print(etree.tostring(translate_paragraph(para), pretty_print=True).decode()) - # insert_after(translate_paragraph(para), usc_29['/us/usc/t29/s203/e/5']) except: return [] diff --git a/backend/billparser/initializer.py b/backend/billparser/initializer.py deleted file mode 100644 index f3a656cc..00000000 --- a/backend/billparser/initializer.py +++ /dev/null @@ -1,14 +0,0 @@ -from billparser.db.handler import import_title, get_number -import locale -def getpreferredencoding(do_setlocale = True): - return "utf-8" -locale.getpreferredencoding = getpreferredencoding - -if __name__ == "__main__": - import os - - files = os.listdir("usc") - files = [x[3:].split(".")[0] for x in files] - files = sorted(files, key=lambda x: get_number(x)) - for file in files: - import_title(file, "Q1-2019") diff --git a/backend/billparser/nightly.py b/backend/billparser/nightly.py index 7e24a954..036a52bf 100644 --- a/backend/billparser/nightly.py +++ b/backend/billparser/nightly.py @@ -2,7 +2,7 @@ from billparser.run_through import run_archives from billparser.prune import run_prune - +# TODO: Put this back onto a cron job if __name__ == "__main__": download() run_archives() diff --git a/backend/billparser/prune.py b/backend/billparser/prune.py index 8d53023d..fd4c99dc 100644 --- a/backend/billparser/prune.py +++ b/backend/billparser/prune.py @@ -3,6 +3,9 @@ def run_prune(): + # This is removing any bills that I was unable to generate differences for + # The thought behind this is that they stay in the list, but they don't show anything + # So they distract from the functionality of the site session = Session() boys = session.query(ContentDiff.version_id).distinct(ContentDiff.version_id).all() session.execute("ALTER TABLE version DISABLE TRIGGER ALL;") diff --git a/backend/billparser/run_through.py b/backend/billparser/run_through.py index 239819f4..5bed3841 100644 --- a/backend/billparser/run_through.py +++ b/backend/billparser/run_through.py @@ -35,18 +35,39 @@ from billparser.translater import translate_paragraph from joblib import Parallel, delayed - +from typing import List text_paths = ["legis-body/section/subsection/text", "legis-body/section/text"] BASE_VERSION = 1 THREADS = 16 -def strip_arr(arr): +Element = etree.Element + +def strip_arr(arr: List[str]) -> List[str]: + """ + Strips all the strings in the input array + + Args: + arr (List[str]): List of strings to strip + + Returns: + List[str]: Stripped strings + """ return [x.strip() for x in arr] -def convert_to_text(element, inside_quote=False): +def convert_to_text(element: Element, inside_quote: bool=False) -> str: + """ + Converts an element to a string, if there is a quote tag it will add quotes. + + Args: + element (Element): Input element + inside_quote (bool, optional): Doesn't actually do anything. Defaults to False. + + Returns: + str: Stringified version of the element's text + """ ret = "" if element.tag == "quote": ret = '"' @@ -62,8 +83,20 @@ def convert_to_text(element, inside_quote=False): ll = {"subsection": "ss", "paragraph": "p", "section": "s", "subparagraph": "sb"} -def extract_actions(element, path): - # for checking what the order of the elements are +def extract_actions(element: Element, path: str) -> List[dict]: + """ + Looks at an element for textual clues to determine what actions it is implying + These actions will be extracted and passed to other functions to utilize. + + This is a recursive function + + Args: + element (Element): An XML element from a bill. + path (str): The path of this element in the document + + Returns: + List[dict]: A flat list of the extracted actions of all the elements + """ res = [] try: @@ -128,7 +161,7 @@ def extract_actions(element, path): def extract_single_action( - element: "lxml.Element", path: str, parent_action: dict + element: Element, path: str, parent_action: dict ) -> list: """ Takes in an element and a path (relative within the bill) @@ -204,7 +237,19 @@ def extract_single_action( cite_contexts = {"last_title": None} -def parse_action_for_cite(action_object): +def parse_action_for_cite(action_object: dict) -> str: + """ + Looks at a given action object to determine what citation it is trying to edit. + A citation represents a location in the USCode + + # TODO: Split function apart + + Args: + action_object (dict): An object represented an extract action + + Returns: + str: A USCode citation str + """ try: parent_cite = "" if action_object["text_element"] is not None: @@ -638,7 +683,17 @@ def run_action2(ACTION, new_bill_version, new_vers_id, session): chamb = {"hr": "House", "s": "Senate"} -def parse_archive(path): +def parse_archive(path: str) -> List[dict]: + """ + Opens a ZipFile that is the dump of all bills. + It will parse each one and return a list of the parsed out objects + + Args: + path (str):Path to the zip file + + Returns: + List[dict]: List of the parsed objects + """ archive = ZipFile(path) names = [] rec = [] diff --git a/backend/billparser/transformer.py b/backend/billparser/transformer.py index bc69798c..167aeed7 100644 --- a/backend/billparser/transformer.py +++ b/backend/billparser/transformer.py @@ -3,7 +3,7 @@ from sqlalchemy.orm import sessionmaker from billparser.db.handler import Session - +# TODO: I think this is all unused, superceded by the actions/ folder def recursive_content( session, chapter_id, diff --git a/backend/billparser/translater.py b/backend/billparser/translater.py index 577fe0b6..0842576f 100644 --- a/backend/billparser/translater.py +++ b/backend/billparser/translater.py @@ -4,6 +4,9 @@ def translate_paragraph(element): + """ + Transforms an XML element from the USCode version into HTML + """ for elem in element.iter(): if elem.tag == "enum": elem.tag = "num" diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 00000000..b0f6b7a2 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1 @@ +yarn-error.log \ No newline at end of file diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 00000000..b0f6b7a2 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1 @@ +yarn-error.log \ No newline at end of file diff --git a/frontend/.nginx/default b/frontend/.nginx/default index bc143018..513f1e90 100644 --- a/frontend/.nginx/default +++ b/frontend/.nginx/default @@ -3,10 +3,10 @@ server { root /var/www/congress; index index.html; location / { - try_files $uri index.html; + try_files $uri /index.html; } listen 443 ssl; - listen [::]:443 ssl ipv6only=on; + listen [::]:443 ssl; ssl_certificate /etc/letsencrypt/live/congress.dev/fullchain.pem; # managed by Certbot ssl_certificate_key /etc/letsencrypt/live/congress.dev/privkey.pem; # managed by Certbot include /etc/letsencrypt/options-ssl-nginx.conf; # managed by Certbot @@ -16,12 +16,12 @@ server { server { server_name api.congress.dev; location / { - proxy_pass http://congress_api:9090; + proxy_pass http://congress_parser_api:9090; } listen 443 ssl; - listen [::]:443 ssl ipv6only=on; + listen [::]:443 ssl; ssl_certificate /etc/letsencrypt/live/congress.dev/fullchain.pem; # managed by Certbot ssl_certificate_key /etc/letsencrypt/live/congress.dev/privkey.pem; # managed by Certbot include /etc/letsencrypt/options-ssl-nginx.conf; # managed by Certbot ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; # managed by Certbot -} \ No newline at end of file +} diff --git a/frontend/package.json b/frontend/package.json index 14c7dcdd..978a6547 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -36,7 +36,7 @@ "not op_mini all" ], "main": "index.js", - "repository": "git@github.com:mustyoshi/bill-viewer.git", + "repository": "git@github.com:mustyoshi/congress-dev.git", "author": "mustyoshi@gmail.com <mustyoshi@mustyoshi.com>", "license": "MIT" } diff --git a/frontend/src/components/bills/bill_view.jsx b/frontend/src/components/bills/bill_view.jsx index 9839da6a..2ee5ebc1 100644 --- a/frontend/src/components/bills/bill_view.jsx +++ b/frontend/src/components/bills/bill_view.jsx @@ -31,7 +31,7 @@ const styles={ marginBottom: '0px' }, unchanged: {}, - + added: { backgroundColor: '#cdffd8' }, @@ -107,7 +107,7 @@ class BillViewportSection extends Component { } const bgStyle = bgStyles[this.getbg(item.ap)]; return ( - <div name={item.bill_content_id} style={ parent=== 'Legis' ? {...styles.font} : (styles[item.content_type] || styles.section) } ref={(elem)=> {this.elements.push(elem)}}> + <div key={item.bill_content_id} name={item.bill_content_id} style={ parent=== 'Legis' ? {...styles.font} : (styles[item.content_type] || styles.section) } ref={(elem)=> {this.elements.push(elem)}}> <span style={{...bgStyle, ...styles.font}}> <b style={styles.font} name={item.bill_content_id}>{item.display} {item.heading}</b> {item.heading ? (<p style={ {...styles.continue, ...bgStyle, ...styles.font} }>{item.content}</p>) : <span>{item.content}</span>} @@ -152,7 +152,7 @@ class BillViewportSection extends Component { <h1 style={{...styles.font, ...styles.centered}}>{chamberToShort[metadata.chamber]} {metadata.number}</h1> <h3 style={{...styles.font, ...styles.centered, fontStyle: 'italic'}}>{versionToFull[metadata.version]}</h3> </>: null} - + {this.generate_divs(items)} </Scrollbars> ); diff --git a/frontend/src/components/bills/reader.jsx b/frontend/src/components/bills/reader.jsx index 6a083a93..702fa8f3 100644 --- a/frontend/src/components/bills/reader.jsx +++ b/frontend/src/components/bills/reader.jsx @@ -31,7 +31,7 @@ const styles={ marginBottom: '0px' }, unchanged: {}, - + added: { backgroundColor: '#cdffd8' }, @@ -191,7 +191,7 @@ class BillReader extends Component { // console.log(sectionLookup[sectionId]); getSectionContent(intTitles[sectionLookup[sectionId].chapter_id].number, sectionLookup[sectionId].number); intTitles[sectionLookup[sectionId].chapter_id].sub.push(sectionLookup[sectionId]); - } + } }); const {diffTitles = {}} = this.state; this.setState({diffTitles: {...diffTitles, ...intTitles}}); @@ -254,7 +254,7 @@ class BillReader extends Component { metadata={metadata} /> ) - + const {childLookup = {}, parentLookup= {}} = this.state; const items = childLookup[parent] || []; if(Object.keys(childLookup).length === 0) { @@ -336,11 +336,11 @@ class BillReader extends Component { {this.renderSideBar()} </Col> <Col lg={9} style={ styles.col_a } ref={this.viewportRef}> - { chapter ? this.renderViewport() : + { chapter ? this.renderViewport() : <> - {window.performance.mark('Reader')} + {window.performance.mark('Reader') && null} {this.generate_divs('Legis')} - {console.log("Reader", window.performance.now('Reader'))} + {console.log("Reader", window.performance.now('Reader')) && null} </> } </Col> diff --git a/frontend/src/components/common/utils.js b/frontend/src/components/common/utils.js index 44797ceb..4cc8c034 100644 --- a/frontend/src/components/common/utils.js +++ b/frontend/src/components/common/utils.js @@ -1,6 +1,4 @@ -//export const host = window.location.hostname.indexOf('localhost') == 0 ? 'http://localhost:9090' : `https://api.${window.location.hostname.replace('beta.', '')}`; -export const host = "http://localhost:9090" -//export const host = "http://10.0.0.249:9090"; +export const host = window.location.hostname.indexOf('localhost') == 0 ? 'http://localhost:9090' : 'https://api.congress.dev'; function shallowEqual(objA: mixed, objB: mixed): boolean { if (objA === objB) { return true; diff --git a/frontend/src/components/nav/nav.jsx b/frontend/src/components/nav/nav.jsx index 86043041..4ccf65ee 100644 --- a/frontend/src/components/nav/nav.jsx +++ b/frontend/src/components/nav/nav.jsx @@ -12,7 +12,7 @@ class NavBarClass extends Component { return ( <Navbar style={{marginBottom: '10px'}}> <Navbar.Brand> - Congress.dev{(version && versionLookup[version])? `- ${versionLookup[version].title}` : null} <span className="badge badge-pill badge-info">0.0.5</span> + Congress.dev{(version && versionLookup[version])? `- ${versionLookup[version].title}` : null} <span className="badge badge-pill badge-info">0.0.6</span> </Navbar.Brand> <Nav bsStyle='pills'> <NavItem eventKey={1} href="/"> @@ -21,7 +21,7 @@ class NavBarClass extends Component { <NavItem eventKey={2} href="/revisions"> US Code Revisions </NavItem> - + <NavItem eventKey={2} href="/bills"> Bills </NavItem>