From f2ae3ff1f66836401f799459fdc7715fda5ecfaf Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Tue, 24 Sep 2024 19:42:49 +0000 Subject: [PATCH 1/8] Switch to poetry for requirements --- pyproject.toml | 13 +++++++++++++ requirements.txt | 8 -------- 2 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..d604fb28 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[tool.poetry] +package-mode = false + +[tool.poetry.dependencies] +python = "^3.9" +beautifulsoup4 = "4.12.3" +everypolitician = "0.0.13" +lxml = "5.2.1" +python-dateutil = "2.2" +requests = { version = "2.32.3", extras = ["security"] } +requests-cache = "0.4.13" +Click = "7.0" +click-log = "0.3.2" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 373b954c..00000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -beautifulsoup4==4.12.3 -everypolitician==0.0.13 -lxml==5.2.1 -python-dateutil==2.2 -requests[security]==2.21.0 -requests-cache==0.4.13 -Click==7.0 -click-log==0.3.2 From 11f67344d545ad7a3d74472759807fcaf2a1f1db Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Tue, 24 Sep 2024 19:43:01 +0000 Subject: [PATCH 2/8] Add dockerfile --- .devcontainer/devcontainer.json | 22 +++++++++++++++++++ .vscode/settings.json | 38 +++++++++++++++++++++++++++++++++ Dockerfile | 15 +++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .vscode/settings.json create mode 100644 Dockerfile diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..9fa553c8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +{ + "name": "parlparse", + "build": + { + "dockerfile": "../Dockerfile" + }, + "workspaceFolder": "/workspaces/parlparse", + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.test-adapter-converter", + "ms-azuretools.vscode-docker", + "bmewburn.vscode-intelephense-client", + "bungcip.better-toml", + "ms-python.python", + "ms-python.vscode-pylance", + "charliermarsh.ruff", + "mhutchie.git-graph" + ] + } + } +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..6b061139 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,38 @@ +{ + "[python]": { + "editor.formatOnSave": true + }, + "python.defaultInterpreterPath": "/usr/bin/python", + "python.terminal.activateEnvironment": false, + "python.analysis.typeCheckingMode": "basic", + "editor.formatOnSave": true, + "files.exclude": { + "**/.git": true, + "**/.svn": true, + "**/.hg": true, + "**/CVS": true, + "**/.DS_Store": true, + "**/*.pyc": { + "when": "$(basename).py" + }, + "**/__pycache__": true + }, + "files.associations": { + "**/*.html": "html", + "**/templates/**/*.html": "django-html", + "**/templates/**/*": "django-txt", + "**/requirements{/**,*}.{txt,in}": "pip-requirements" + }, + "[markdown]": { + "editor.quickSuggestions": { + "comments": "on", + "strings": "on", + "other": "on" + } + }, + "python.testing.pytestArgs": [ + "tests/" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..684ee5b9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM mysocietyorg/debian:bullseye +RUN apt-get update && \ + apt-get install python3-distutils python3-pip libxml2-dev libxslt-dev 
python-dev libffi-dev -y && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ + update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ + pip install --upgrade pip + +RUN curl -sSL https://install.python-poetry.org | /usr/bin/python3 - +ENV PATH="/root/.local/bin:$PATH" + +ENV PYTHONPATH=$PYTHONPATH:/usr/lib/python3.9/site-packages +ENV POETRY_VIRTUALENVS_CREATE=false + +COPY pyproject.toml poetry.loc[k] /tmp/pyproject/ +RUN cd /tmp/pyproject && poetry install \ No newline at end of file From de1ba3c44b3c063d840d103d78b800c8c7582064 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Tue, 24 Sep 2024 20:00:08 +0000 Subject: [PATCH 3/8] Add ruff linting options --- pyproject.toml | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ script/lint | 10 +++++++++ 2 files changed, 69 insertions(+) create mode 100755 script/lint diff --git a/pyproject.toml b/pyproject.toml index d604fb28..a7b7661f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,3 +11,62 @@ requests = { version = "2.32.3", extras = ["security"] } requests-cache = "0.4.13" Click = "7.0" click-log = "0.3.2" + +[tool.poetry.group.dev.dependencies] +ruff = "^0.6.7" + +[tool.ruff] + +extend-include = ["scripts/*"] +extend-exclude = [ + 'scripts/.gitignore', + 'scripts/2016_data_update/README.txt', + 'scripts/config.pm.incvs', + 'scripts/consts', + 'scripts/crontab', + 'scripts/dailyupdate', + 'scripts/datadotparl/mp-party-check', + 'scripts/datadotparl/one-off-add-pims-ids', + 'scripts/datadotparl/one-off-sync-lord-parties', + 'scripts/dircmp', + 'scripts/divisionextractor.pl', + 'scripts/morningupdate', + 'scripts/ni-format-revert', + 'scripts/ni_membership.php', + 'scripts/one-off-move-names-to-persons', + 'scripts/other-sites-update', + 'scripts/updatedaterange-parse', + 'scripts/updatedaterange-scrape', + 'scripts/weeklyupdate', + 'scripts/ynmp/tests.txt' +] + + + +[tool.ruff.lint] +select = [ + "E", + # flake8 + "F", + # isort + "I", +] +ignore = [ + # line too long, sorted with formatter where it can be + "E501", +] + + +[tool.ruff.lint.isort] +known-first-party = ["hub"] +section-order = [ + "future", + "standard-library", + "django", + "third-party", + "first-party", + "local-folder" +] + +[tool.ruff.lint.isort.sections] +django = ["django"] \ No newline at end of file diff --git a/script/lint b/script/lint new file mode 100755 index 00000000..28d42b1c --- /dev/null +++ b/script/lint @@ -0,0 +1,10 @@ +#!/bin/bash + +poetry run ruff format . + +# This ignores a number of linting checks that are *problems* and so +# we want to be able to see in editor (and can't put in pyproject.toml) +# but we don't want to have to fix everything (given it's working fineish) +# to see new issues +# this is the 'using is' for equality, top module imports broken by chdir, don't use lambdas, etc +poetry run ruff check . 
--fix --config 'lint.ignore = ["E501", "E402", "E731", "E722", "F841", "E711", "E712"]' \ No newline at end of file From 395019f26059336809359bc8478e9345a2876d6c Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Tue, 24 Sep 2024 20:00:24 +0000 Subject: [PATCH 4/8] Automatic formatting --- filtersentence_xml.py | 177 ++- london-mayors-questions/questions.py | 815 +++++----- members/parl-old-check-party.py | 253 +++- members/wikipedia-commons.py | 60 +- members/wikipedia-lords.py | 32 +- members/wikipedia-standingdown.py | 21 +- pyscraper/base_resolver.py | 281 ++-- pyscraper/contextexception.py | 4 +- pyscraper/get_links_from_ep.py | 26 +- pyscraper/gettwittermps.py | 37 +- pyscraper/gidmatching.py | 686 +++++---- pyscraper/lazyrunall.py | 145 +- pyscraper/lords/resolvenames.py | 180 ++- pyscraper/miscfuncs.py | 460 +++--- pyscraper/new_hansard.py | 1332 +++++++++-------- pyscraper/ni/parse.py | 179 ++- pyscraper/ni/resolvenames.py | 137 +- pyscraper/ni/scrape.py | 119 +- pyscraper/ni/wikipedia-mla.py | 44 +- pyscraper/parlphrases.py | 408 +++-- pyscraper/patchtool.py | 252 ++-- pyscraper/process_hansard.py | 71 +- pyscraper/pullgluepages.py | 50 +- pyscraper/regmem/filter.py | 133 +- pyscraper/regmem/pullgluepages.py | 199 ++- pyscraper/resolvemembernames.py | 374 +++-- pyscraper/runfilters.py | 99 +- pyscraper/sp/common.py | 48 +- pyscraper/sp/fastest-msps.py | 184 ++- pyscraper/sp/get-official-reports-new.py | 111 +- pyscraper/sp/parse-official-reports-new.py | 659 +++++--- pyscraper/sp/resolvenames.py | 142 +- pyscraper/sp/wikipedia-msp.py | 25 +- pyscraper/sp_2024/__main__.py | 10 +- pyscraper/sp_2024/common.py | 8 +- pyscraper/sp_2024/convert.py | 8 +- pyscraper/sp_2024/download.py | 2 +- pyscraper/sp_2024/parse.py | 6 +- pyscraper/test.py | 26 +- pyscraper/unpack_hansard_zips.py | 76 +- pyscraper/wa/parse.py | 319 ++-- pyscraper/wa/resolvenames.py | 30 +- pyscraper/wa/scrape.py | 66 +- pyscraper/xmlfilewrite.py | 17 +- scripts/2016_data_update/dadem_import_ni.py | 181 ++- scripts/2016_data_update/dadem_import_sp.py | 187 ++- scripts/2021-lam-update | 126 +- scripts/2021-msp-update | 139 +- scripts/add-new-lords | 209 +-- scripts/add-new-mlas | 96 +- scripts/datadotparl/crawl-members | 47 +- scripts/datadotparl/json-add-new-parl-ids | 94 +- scripts/datadotparl/update-members | 400 ++--- scripts/fetch-mp-eu-ref-positions | 33 +- scripts/fetch-pw-json | 21 +- scripts/fetch_london_assembly.py | 378 ++--- scripts/fetch_scottish_ministers.py | 56 +- .../fetch_wikidata_from_everypolitician.py | 111 +- scripts/json-add-membership | 136 +- scripts/json-body-end | 14 +- scripts/json-change-party | 85 +- scripts/json-edit-person | 34 +- scripts/json-end-membership | 25 +- scripts/json-merge-people | 43 +- scripts/json-new-ids | 1 + scripts/json-nia-2017-new | 94 +- scripts/json-nia-2022-new | 86 +- scripts/popolo/__init__.py | 196 ++- scripts/popolo/menu.py | 26 +- scripts/popolo/utils.py | 24 +- scripts/quickupdate | 84 +- scripts/welsh-parliament/dual-posts.py | 74 +- scripts/welsh-parliament/memberships.py | 138 +- scripts/welsh-parliament/official-ids.py | 73 +- scripts/welsh-parliament/organizations.py | 102 +- scripts/welsh-parliament/persons.py | 108 +- scripts/welsh-parliament/posts.py | 108 +- scripts/ynmp/update.py | 274 ++-- wrans-2014/parse.py | 183 ++- 79 files changed, 7013 insertions(+), 5284 deletions(-) diff --git a/filtersentence_xml.py b/filtersentence_xml.py index 2e64fbb7..fe5723c5 100644 --- a/filtersentence_xml.py +++ b/filtersentence_xml.py @@ -1,13 +1,10 @@ -from 
datetime import datetime import re - -from lxml import etree +from datetime import datetime from contextexception import ContextException from parlphrases import parlPhrases from resolvemembernames import memberList - # this code fits onto the paragraphs before the fixhtmlentities and # performs difficult regular expression matching that can be # used for embedded links. @@ -33,22 +30,26 @@ reqnum = re.compile("\s*\[(\d+)\]\s*$") refqnum = re.compile("\s*\[(\d+)\]\s*") -redatephraseval = re.compile('(?:(?:%s),? )?(\d+(?: | )*(?:%s)( \d+)?)' % (parlPhrases.daysofweek, parlPhrases.monthsofyear)) +redatephraseval = re.compile( + "(?:(?:%s),? )?(\d+(?: | )*(?:%s)( \d+)?)" + % (parlPhrases.daysofweek, parlPhrases.monthsofyear) +) def TokenDate(ldate, phrtok): sdate_year = phrtok.sdate[0:4] - tdate = ldate.group(0).replace(' ', ' ') + tdate = ldate.group(0).replace(" ", " ") if not ldate.group(2): tdate += " %s" % sdate_year try: - lldate = datetime.strptime(tdate, '%A, %d %B %Y') + lldate = datetime.strptime(tdate, "%A, %d %B %Y") phrtok.lastdate = lldate.date().isoformat() except: - phrtok.lastdate = '' - return ('phrase', ' class="date" code="%s"' % phrtok.lastdate) + phrtok.lastdate = "" + return ("phrase", ' class="date" code="%s"' % phrtok.lastdate) -restandingo = re.compile('''(?x) + +restandingo = re.compile("""(?x) (?:)? Standing\sOrder\sNo\.\s* ( @@ -60,7 +61,7 @@ def TokenDate(ldate, phrtok): \(([^()]*(?:\([^()]*\))?)\) # inclusion of title for clarity )? (?:)? -''') +""") restandingomarg = re.compile("Standing Order No") @@ -68,23 +69,26 @@ def TokenDate(ldate, phrtok): def TokenStandingOrder(mstandingo, phrtok): if mstandingo.group(2): return ( - 'phrase', ' class="standing-order" code="%s" title="%s"' % - (mstandingo.group(1), re.sub('<[^>]*>', '', mstandingo.group(2))) + "phrase", + ' class="standing-order" code="%s" title="%s"' + % (mstandingo.group(1), re.sub("<[^>]*>", "", mstandingo.group(2))), ) - return ( - 'phrase', ' class="standing-order" code="%s"' % mstandingo.group(1) - ) + return ("phrase", ' class="standing-order" code="%s"' % mstandingo.group(1)) + + +rehtlink = re.compile("(?\s*official(?:||\s)*report # Official Report (?:||[,;\s])* (Commons|House\sof\sCommons|House\sof\sLords)? # Optional house (1) @@ -96,7 +100,7 @@ def TokenHrefLink(mhttp, phrtok): (?:(W[AS]?)\s*)? # Optional column number prefix (2) (\d+(?:(?:&\#150;|-)\d+)?) 
# Column number or numbers (3) ([WHSA]*) # Optional column suffix (4) -''') +""") def TokenOffRep(qoffrep, phrtok): @@ -108,30 +112,31 @@ def TokenOffRep(qoffrep, phrtok): if qcolsuffix: qcolsuffix = qcolsuffix.upper() # print '*', qoffrep.group(0), loc1, qcolprefix, qcolsuffix, qoffrep.group(3) - qcpart = re.match('(\d+)(?:(?:–|-)(\d+))?(?i)$', qoffrep.group(3)) + qcpart = re.match("(\d+)(?:(?:–|-)(\d+))?(?i)$", qoffrep.group(3)) qcolnum = qcpart.group(1) if qcpart.group(2): - qcpartlead = qcpart.group(1)[len(qcpart.group(1)) - len(qcpart.group(2)):] + qcpartlead = qcpart.group(1)[len(qcpart.group(1)) - len(qcpart.group(2)) :] if int(qcpartlead) >= int(qcpart.group(2)): - print(' non-following column leadoff ', qoffrep.group(0)) + print(" non-following column leadoff ", qoffrep.group(0)) # raise Exception, ' non-following column leadoff ' - if qcolsuffix == 'WH': - sect = 'westminhall' - elif qcolprefix == 'WS' or qcolsuffix == 'WS': - sect = 'wms' - elif qcolprefix == 'WA' or qcolsuffix == 'W' or qcolsuffix == 'WA': - sect = 'wrans' - elif loc1 == 'House of Lords': - sect = 'lords' + if qcolsuffix == "WH": + sect = "westminhall" + elif qcolprefix == "WS" or qcolsuffix == "WS": + sect = "wms" + elif qcolprefix == "WA" or qcolsuffix == "W" or qcolsuffix == "WA": + sect = "wrans" + elif loc1 == "House of Lords": + sect = "lords" else: - sect = 'debates' + sect = "debates" + + offrepid = "%s/%s.%s" % (sect, phrtok.lastdate, qcolnum) + return ("phrase", ' class="offrep" id="%s"' % offrepid) - offrepid = '%s/%s.%s' % (sect, phrtok.lastdate, qcolnum) - return ('phrase', ' class="offrep" id="%s"' % offrepid) # Date in the middle, so need to match before the date-only parsing... -reoffrepwdate = re.compile('''(?ix) +reoffrepwdate = re.compile("""(?ix) \s*official(?:||\s)*report # Official Report (?:(?:||,|\s)*(Westminster\sHall|House\sof\sLords|House\sof\sCommons))? # Optionally followed by a chamber (1) [,;]?\s*(?:)?[,;]?\s* @@ -144,47 +149,48 @@ def TokenOffRep(qoffrep, phrtok): (?:(W[AS]?)\s*)? # Optional column number prefix (4) (\d+)(?:(?:&\#150;|-)\d+)? 
# Column number or numbers (5) ([WHS]*) # Optional column number suffix (6) -''') +""") def TokenOffRepWDate(qoffrep, phrtok): # print qoffrep.group(0) loc1 = qoffrep.group(1) loc2 = qoffrep.group(2) - date = qoffrep.group(3).replace(' ', ' ') + date = qoffrep.group(3).replace(" ", " ") qcolprefix = qoffrep.group(4) qcolnum = qoffrep.group(5) qcolsuffix = qoffrep.group(6) - m = re.match('(\d+)/(\d+)/(\d+)', date) + m = re.match("(\d+)/(\d+)/(\d+)", date) if m: lordsdate = True - date = datetime.strptime(date, '%d/%m/%Y').date().isoformat() + date = datetime.strptime(date, "%d/%m/%Y").date().isoformat() else: lordsdate = False - date = datetime.strptime(date, '%d %B %Y').date().isoformat() + date = datetime.strptime(date, "%d %B %Y").date().isoformat() if qcolprefix: qcolprefix = qcolprefix.upper() if qcolsuffix: qcolsuffix = qcolsuffix.upper() - if loc1 == 'Westminster Hall' or qcolsuffix == 'WH': - sect = 'westminhall' - elif qcolprefix == 'WS' or qcolsuffix == 'WS': - sect = 'wms' - elif qcolprefix == 'WA' or qcolsuffix == 'W': - sect = 'wrans' - elif loc1 == 'House of Commons' or loc2 == 'Commons': - sect = 'debates' - elif loc1 == 'House of Lords' or loc2 == 'Lords' or lordsdate: - sect = 'lords' + if loc1 == "Westminster Hall" or qcolsuffix == "WH": + sect = "westminhall" + elif qcolprefix == "WS" or qcolsuffix == "WS": + sect = "wms" + elif qcolprefix == "WA" or qcolsuffix == "W": + sect = "wrans" + elif loc1 == "House of Commons" or loc2 == "Commons": + sect = "debates" + elif loc1 == "House of Lords" or loc2 == "Lords" or lordsdate: + sect = "lords" else: - sect = 'debates' + sect = "debates" + + offrepid = "%s/%s.%s" % (sect, date, qcolnum) + return ("phrase", ' class="offrep" id="%s"' % offrepid) - offrepid = '%s/%s.%s' % (sect, date, qcolnum) - return ('phrase', ' class="offrep" id="%s"' % offrepid) -#my hon. Friend the Member for Regent's Park and Kensington, North (Ms Buck) +# my hon. Friend the Member for Regent's Park and Kensington, North (Ms Buck) # (sometimes there are spurious adjectives -rehonfriend = re.compile('''(?ix) +rehonfriend = re.compile("""(?ix) the\.? # Privy counsellors, barrister, armed forces, status, etc. (?:(?:\s|&.{4};)*(?:right\.?|rt\.|very|old|new|now|current|then|visiting|former|distinguished|hon\.?|honourable|and|learned|gallant|Labour|Liberal Democrat|Conservative|reverend|independent|excellent|poor|rude|courageous|wonderful|brutal|redoubtable|mute|present|pious|formidable|fragrant))* @@ -193,14 +199,16 @@ def TokenOffRepWDate(qoffrep, phrtok): ([^(]{3,60}?) # group 1 the name of the constituency \s* \(([^)]{5,60}?)(?:&\#(?:146|8217);s)?\) # group 2 the name of the MP, inserted for clarity. 
-''') -rehonfriendmarg = re.compile('the\s+(hon\.\s*)?member for [^(]{0,60}\((?i)') +""") +rehonfriendmarg = re.compile("the\s+(hon\.\s*)?member for [^(]{0,60}\((?i)") def TokenHonFriend(mhonfriend, phrtok): # will match for ids orgname = mhonfriend.group(2) - res = memberList.matchfullnamecons(orgname, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons=False) + res = memberList.matchfullnamecons( + orgname, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons=False + ) if not res[0]: # comes back as None nid = "unknown" mname = orgname @@ -212,38 +220,36 @@ def TokenHonFriend(mhonfriend, phrtok): # remove any xml entities from the name orgname = res[1] - return ('phrase', ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)) + return ("phrase", ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)) # the array of tokens which we will detect on the way through tokenchain = [ - ('hreflink', rehreflink, None, TokenHrefLink), - ('offrepwdate', reoffrepwdate, None, TokenOffRepWDate), - ("date", redatephraseval, None, TokenDate), - ("offrep", reoffrepw, None, TokenOffRep), - ("standing order", restandingo, restandingomarg, TokenStandingOrder), - ("httplink", rehtlink, None, TokenHttpLink), - ("honfriend", rehonfriend, rehonfriendmarg, TokenHonFriend), + ("hreflink", rehreflink, None, TokenHrefLink), + ("offrepwdate", reoffrepwdate, None, TokenOffRepWDate), + ("date", redatephraseval, None, TokenDate), + ("offrep", reoffrepw, None, TokenOffRep), + ("standing order", restandingo, restandingomarg, TokenStandingOrder), + ("httplink", rehtlink, None, TokenHttpLink), + ("honfriend", rehonfriend, rehonfriendmarg, TokenHonFriend), ] # this handles the chain of tokenization of a paragraph class PhraseTokenize: - # recurses over itc < len(tokenchain) def TokenizePhraseRecurse(self, qs, stex, itc): - # end of the chain if itc == len(tokenchain): - self.toklist.append(('', '', stex)) + self.toklist.append(("", "", stex)) return # keep eating through the pieces for the same token while stex: # attempt to split the token mtoken = tokenchain[itc][1].search(stex) - if mtoken: # the and/or method fails with this - headtex = stex[:mtoken.span(0)[0]] + if mtoken: # the and/or method fails with this + headtex = stex[: mtoken.span(0)[0]] else: headtex = stex @@ -268,29 +274,34 @@ def TokenizePhraseRecurse(self, qs, stex, itc): # print "Token detected:", mtoken.group(0) # the tail part - stex = stex[mtoken.span(0)[1]:] + stex = stex[mtoken.span(0)[1] :] def __init__(self, date, stex): - self.lastdate = '' + self.lastdate = "" self.toklist = [] self.sdate = date - stex = re.sub('&(?!amp;)', '&', stex) + stex = re.sub("&(?!amp;)", "&", stex) # separate out any qnums at end of paragraph self.rmqnum = reqnum.search(stex) if self.rmqnum: - stex = stex[:self.rmqnum.span(0)[0]] + stex = stex[: self.rmqnum.span(0)[0]] # separate out qnums stuffed into front of paragraph (by the grabber of the speakername) frqnum = refqnum.match(stex) if frqnum: if self.rmqnum: - raise ContextException('Found question number [%s] in para, but already found [%s] at end (this probably just means it is being quoted, and you just need to change [] to ().' % (frqnum.group(1), self.rmqnum.group(1))) + raise ContextException( + "Found question number [%s] in para, but already found [%s] at end (this probably just means it is being quoted, and you just need to change [] to ()." 
+ % (frqnum.group(1), self.rmqnum.group(1)) + ) self.rmqnum = frqnum - stex = stex[frqnum.span(0)[1]:] - stex_nohtml = re.sub('<[^>]*>', '', stex) + stex = stex[frqnum.span(0)[1] :] + stex_nohtml = re.sub("<[^>]*>", "", stex) if len(stex_nohtml) < 10: - raise ContextException('Removing question number from para appears to have removed all text (this probably just means a footnote marker is using [], just change to ()).') + raise ContextException( + "Removing question number from para appears to have removed all text (this probably just means a footnote marker is using [], just change to ())." + ) self.TokenizePhraseRecurse(date, stex, 0) @@ -299,10 +310,10 @@ def GetPara(self): for tok in self.toklist: if tok[0]: - res.append('<%s%s>' % (tok[0], tok[1])) + res.append("<%s%s>" % (tok[0], tok[1])) res.append(tok[2]) - res.append('' % tok[0]) + res.append("" % tok[0]) else: res.append(tok[2]) - return ''.join(res) + return "".join(res) diff --git a/london-mayors-questions/questions.py b/london-mayors-questions/questions.py index 73db32a8..f5123ed4 100755 --- a/london-mayors-questions/questions.py +++ b/london-mayors-questions/questions.py @@ -1,95 +1,90 @@ #! /usr/bin/env python3 -import os +import datetime +import json import logging +import os +import re +import string import click import click_log - -import json -import datetime import dateutil.parser -import re - import requests import requests_cache - -import string - from bs4 import BeautifulSoup, element from lxml import etree -from lxml.html import soupparser # Set up logging logger = logging.getLogger(__name__) click_log.basic_config(logger) # Set up the requests cache -cache_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache') -requests_cache.install_cache(cache_path, expire_after=60*60*12) +cache_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache") +requests_cache.install_cache(cache_path, expire_after=60 * 60 * 12) # Load and parsethe configuration file -with open('config.json') as config_json_file: - logger.debug('Reading config file') +with open("config.json") as config_json_file: + logger.debug("Reading config file") config = json.load(config_json_file) # Set our constants -ASSEMBLY_DOMAIN = config['assembly_domain'] -DEFAULT_START_DATE = config['default_start_date'] -PUBLIC_WHIP_QUESTION_ID_PREFIX = config['public_whip_question_id_prefix'] -CURRENT_MAYOR_NAME = config['current_mayor_name'] -NAME_REGEX_TO_STRIP = config['name_regex_to_strip'] -NAME_CORRECTIONS = config['name_corrections'] +ASSEMBLY_DOMAIN = config["assembly_domain"] +DEFAULT_START_DATE = config["default_start_date"] +PUBLIC_WHIP_QUESTION_ID_PREFIX = config["public_whip_question_id_prefix"] +CURRENT_MAYOR_NAME = config["current_mayor_name"] +NAME_REGEX_TO_STRIP = config["name_regex_to_strip"] +NAME_CORRECTIONS = config["name_corrections"] # This needs to match the type from xml2db.pl in TWFY -XML_FILE_PREFIX = config['xml_file_prefix'] +XML_FILE_PREFIX = config["xml_file_prefix"] -CLI_DATETIME_FORMAT = click.DateTime(formats=('%Y-%m-%d',)) +CLI_DATETIME_FORMAT = click.DateTime(formats=("%Y-%m-%d",)) -STATE_JSON_FILENAME = 'state.json' +STATE_JSON_FILENAME = "state.json" -EMPTY_STATE_OBJECT = { - 'dates': {}, - 'questions': {} -} +EMPTY_STATE_OBJECT = {"dates": {}, "questions": {}} def getScraperState(output_folder): - ''' Load the scraper's state from file. 
''' + """Load the scraper's state from file.""" state_file = os.path.join(output_folder, STATE_JSON_FILENAME) # Check this file exists before we load it if os.path.exists(state_file): - with open(state_file) as state_json_file: - logger.debug('Reading state file') + logger.debug("Reading state file") state = json.load(state_json_file) # If not, just use the empty object. It'll be written at wrap-up. else: - logger.warning('Could not find existing state file at {}, creating new one'.format(state_file)) + logger.warning( + "Could not find existing state file at {}, creating new one".format( + state_file + ) + ) state = EMPTY_STATE_OBJECT return state def writeScraperState(state, output_folder): - ''' Write the scraper's state back out to file. ''' + """Write the scraper's state back out to file.""" output_file = os.path.join(output_folder, STATE_JSON_FILENAME) try: json_string = json.dumps(state, indent=2, default=str) - with open(output_file, 'w') as state_json_file: - logger.debug('Writing state file') + with open(output_file, "w") as state_json_file: + logger.debug("Writing state file") state_json_file.write(json_string) except TypeError as e: - logger.error('Could not serialise to valid JSON: {}'.format(str(e))) + logger.error("Could not serialise to valid JSON: {}".format(str(e))) def getDatesInRange(start, end): - ''' Return an array of dates between (and inclusive of) those given. ''' + """Return an array of dates between (and inclusive of) those given.""" delta = end - start dates = [] @@ -102,55 +97,57 @@ def getDatesInRange(start, end): def scrapeAssemblyMeetingOnDate(date): - ''' Scrape the Mayor's Questions meeting page for the provided date ''' + """Scrape the Mayor's Questions meeting page for the provided date""" - meeting_date_string = date.strftime('%Y/%m/%d') + meeting_date_string = date.strftime("%Y/%m/%d") - meeting_date_url = ASSEMBLY_DOMAIN + '/questions/meeting/mqt/' + meeting_date_string + meeting_date_url = ASSEMBLY_DOMAIN + "/questions/meeting/mqt/" + meeting_date_string - logger.debug('Scraping meeting page at {}'.format(meeting_date_url)) + logger.debug("Scraping meeting page at {}".format(meeting_date_url)) meeting_page = requests.get(meeting_date_url) - scraped_data = { - 'http_status': str(meeting_page.status_code) - } + scraped_data = {"http_status": str(meeting_page.status_code)} if meeting_page.status_code == 404: - logger.info('Meeting on {} returned HTTP 404'.format(date)) - scraped_data['to_scrape'] = False + logger.info("Meeting on {} returned HTTP 404".format(date)) + scraped_data["to_scrape"] = False elif meeting_page.status_code == 200: - logger.info('Meeting on {} returned HTTP 200'.format(date)) - - scraped_data['sessions'] = parseAssemblyMeetingToSessions(meeting_page.content) - scraped_data['questions'] = [] - - if len(scraped_data['sessions']) > 0: - scraped_data['to_scrape'] = False - for session in scraped_data['sessions']: - scraped_data['questions'] += scrapeSessionAtUrl(session) - elif meeting_date_string != '2019/02/25': # Exempt date we know lacks sessions - logger.warning('Meeting on {} doesn\'t seem to have any sessions!'.format(date)) - scraped_data['to_scrape'] = True + logger.info("Meeting on {} returned HTTP 200".format(date)) + + scraped_data["sessions"] = parseAssemblyMeetingToSessions(meeting_page.content) + scraped_data["questions"] = [] + + if len(scraped_data["sessions"]) > 0: + scraped_data["to_scrape"] = False + for session in scraped_data["sessions"]: + scraped_data["questions"] += scrapeSessionAtUrl(session) + elif 
meeting_date_string != "2019/02/25": # Exempt date we know lacks sessions + logger.warning( + "Meeting on {} doesn't seem to have any sessions!".format(date) + ) + scraped_data["to_scrape"] = True else: - logger.warning('Meeting on {} returned HTTP {}'.format(date, meeting_page.status_code)) - scraped_data['to_scrape'] = True + logger.warning( + "Meeting on {} returned HTTP {}".format(date, meeting_page.status_code) + ) + scraped_data["to_scrape"] = True return scraped_data def parseAssemblyMeetingToSessions(content): - ''' Parse an assembly meeting page and return a list of its sessions. ''' + """Parse an assembly meeting page and return a list of its sessions.""" soup = BeautifulSoup(content, features="lxml") - sessions_in_content = soup.find_all('div', class_='entity-meetingsession') + sessions_in_content = soup.find_all("div", class_="entity-meetingsession") sessions_in_meeting = [] for session in sessions_in_content: session_title = session.a.text - session_url = session.a.get('href') + session_url = session.a.get("href") logger.debug('Found session "{}" at URL {}'.format(session_title, session_url)) @@ -160,11 +157,11 @@ def parseAssemblyMeetingToSessions(content): def scrapeSessionAtUrl(session_url): - ''' Scrape a given session URL and extract the questions within. ''' + """Scrape a given session URL and extract the questions within.""" session_full_url = ASSEMBLY_DOMAIN + session_url - logger.debug('Scraping session page at {}'.format(session_full_url)) + logger.debug("Scraping session page at {}".format(session_full_url)) session_page = requests.get(session_full_url) @@ -174,46 +171,48 @@ def scrapeSessionAtUrl(session_url): def parseSessionToQuestions(content): - soup = BeautifulSoup(content, features="lxml") - questions_in_content = soup.find_all('tr', class_='question') + questions_in_content = soup.find_all("tr", class_="question") questions_in_session = [] for question_row in questions_in_content: - - question_row_cells = question_row.findAll('td') + question_row_cells = question_row.findAll("td") question_number = question_row_cells[1].text - logger.debug('Found question {}'.format(question_number)) + logger.debug("Found question {}".format(question_number)) questions_in_session.append(question_number) return questions_in_session -def scrapeQuestionWithId(question_id,context): - ''' Scrape the page for a given question ID and return structured data. 
''' +def scrapeQuestionWithId(question_id, context): + """Scrape the page for a given question ID and return structured data.""" - logger.debug('Scraping question {}'.format(question_id)) + logger.debug("Scraping question {}".format(question_id)) - question_full_url = ASSEMBLY_DOMAIN + '/questions/' + question_id + question_full_url = ASSEMBLY_DOMAIN + "/questions/" + question_id - logger.debug('Scraping question page at {}'.format(question_full_url)) + logger.debug("Scraping question page at {}".format(question_full_url)) question_page = requests.get(question_full_url) if question_page.status_code == 200: - logger.debug('Question {} returned HTTP 200'.format(question_id)) + logger.debug("Question {} returned HTTP 200".format(question_id)) question_parsed_data = parseQuestionPage(question_page.content) else: if question_page.status_code != 403: - logger.warning('Question {} returned HTTP {}'.format(question_id, question_page.status_code)) - context.obj['state']['questions'][question_id]['to_scrape'] = True + logger.warning( + "Question {} returned HTTP {}".format( + question_id, question_page.status_code + ) + ) + context.obj["state"]["questions"][question_id]["to_scrape"] = True question_parsed_data = None @@ -221,56 +220,59 @@ def scrapeQuestionWithId(question_id,context): def parseQuestionPage(content): - ''' Actually take the HTML from a scraped question page and turn it into a structured object. ''' + """Actually take the HTML from a scraped question page and turn it into a structured object.""" soup = BeautifulSoup(content, features="lxml") # We use the canonical URL just in case anything exotic has happened with redirects. - canonical_url = soup.find('link', {'rel': 'canonical'})['href'] + canonical_url = soup.find("link", {"rel": "canonical"})["href"] - main_content = soup.find('div', role='main') + main_content = soup.find("div", role="main") # Pull the title question_title = main_content.h1.text.strip() - logger.debug('Question title is {}'.format(question_title)) + logger.debug("Question title is {}".format(question_title)) # Extract who asked it - asked_by_name = main_content.find('div', class_='field--name-field-asked-by').find('div', class_='field__item').text.strip() + asked_by_name = ( + main_content.find("div", class_="field--name-field-asked-by") + .find("div", class_="field__item") + .text.strip() + ) asked_by_person = getSpeakerObjectFromName(asked_by_name) - logger.debug('Question asked by {}'.format(asked_by_person['name'])) + logger.debug("Question asked by {}".format(asked_by_person["name"])) # Try to extract the actual question - question_text = main_content.find('div', class_='field--name-body').find('div', class_='field__item') + question_text = main_content.find("div", class_="field--name-body").find( + "div", class_="field__item" + ) - question_p_elements = main_content\ - .find('section', class_='question')\ - .findAll('p') + question_p_elements = main_content.find("section", class_="question").findAll("p") question_paragraphs = [] for paragraph in question_p_elements: - # Some paragraphs are helpfully empty. 
Deal with those - if paragraph.text.strip() != '': + if paragraph.text.strip() != "": # NB at this point we're still sending BeautifulSoup objects question_paragraphs.append(paragraph) # We ignore the speaker which comes back with this, but this function otherwise does all the tidying needed question_with_speaker = splitTextToSpeeches(question_text)[0] - question_text_paragraphs = question_with_speaker['paragraphs'] + question_text_paragraphs = question_with_speaker["paragraphs"] # Now we know the title and the question, assemble the basic question object to send back question_object = { - 'title': question_title, - 'canonical_url': canonical_url, - 'question_text_paragraphs': question_text_paragraphs, - 'asked_by': asked_by_person + "title": question_title, + "canonical_url": canonical_url, + "question_text_paragraphs": question_text_paragraphs, + "asked_by": asked_by_person, } # Try parse the actual answers out @@ -278,12 +280,12 @@ def parseQuestionPage(content): # Got answers? - if len(answers_object['answers']) > 0: - question_object['answered'] = True - question_object['answers'] = answers_object['answers'] - question_object['answered_date'] = answers_object['answered_date'] + if len(answers_object["answers"]) > 0: + question_object["answered"] = True + question_object["answers"] = answers_object["answers"] + question_object["answered_date"] = answers_object["answered_date"] else: - question_object['answered'] = False + question_object["answered"] = False # Send the parsed data back upstream @@ -291,102 +293,126 @@ def parseQuestionPage(content): def parseAnswersFromQuestionPage(page_content): - ''' Given page content, see if we can get answers. ''' + """Given page content, see if we can get answers.""" # Look to see if there are any answers given - answers_div = page_content.find('div', class_='answers') + answers_div = page_content.find("div", class_="answers") - answers_object = { - 'answers': [] - } + answers_object = {"answers": []} - answer_articles = answers_div.findAll('article', class_='node--answer') + answer_articles = answers_div.findAll("article", class_="node--answer") for answer_article in answer_articles: # If there's a paragraph with a class of 'holding', we're waiting for an answer. - if answer_article.find('p', class_='holding'): - logger.debug('Question is awaiting an answer') + if answer_article.find("p", class_="holding"): + logger.debug("Question is awaiting an answer") continue # Sometimes the question just has no answer. Because this is "currently", still assume it's unanswered. 
- elif answer_article.find('div', class_='no-answer'): - logger.debug('Question has no available answers.') + elif answer_article.find("div", class_="no-answer"): + logger.debug("Question has no available answers.") continue # Get the date this was answered - this is the important one, not when it was asked, - answer_date = answer_article.find('div', class_='field--name-post-date').find('div', class_='field__item').text + answer_date = ( + answer_article.find("div", class_="field--name-post-date") + .find("div", class_="field__item") + .text + ) - if 'answered_date' not in answers_object: - answers_object['answered_date'] = dateutil.parser.parse(answer_date).date() - logger.debug('Question first answered on {}'.format(answers_object['answered_date'])) + if "answered_date" not in answers_object: + answers_object["answered_date"] = dateutil.parser.parse(answer_date).date() + logger.debug( + "Question first answered on {}".format(answers_object["answered_date"]) + ) # Find who answered it - answered_by_name = answer_article.find('div', class_='field--name-field-answered-by').find('div', class_='field__item').text.strip() + answered_by_name = ( + answer_article.find("div", class_="field--name-field-answered-by") + .find("div", class_="field__item") + .text.strip() + ) answered_by_person = getSpeakerObjectFromName(answered_by_name) - logger.debug('Question answered by {}'.format(answered_by_person['name'])) + logger.debug("Question answered by {}".format(answered_by_person["name"])) answer_paragraphs = [] - answer_body = answer_article.find('div', class_='field--name-body') + answer_body = answer_article.find("div", class_="field--name-body") if answer_body: - answer_p_elements = answer_body.findAll('p') + answer_p_elements = answer_body.findAll("p") for paragraph in answer_p_elements: # Some paragraphs are helpfully empty. Deal with those - if paragraph.text.strip() != '': + if paragraph.text.strip() != "": # NB at this point we're still sending BeautifulSoup objects answer_paragraphs.append(paragraph) - logger.debug('Found {} paragraphs of non-empty answers on page'.format(len(answer_paragraphs))) + logger.debug( + "Found {} paragraphs of non-empty answers on page".format( + len(answer_paragraphs) + ) + ) # Send the paragraphs of answers off to be sliced if this is multiple parts of a conversation answers_by_speech = splitTextToSpeeches(answer_paragraphs) - logger.debug('Found {} individual speeches within this answer'.format(len(answers_by_speech))) + logger.debug( + "Found {} individual speeches within this answer".format( + len(answers_by_speech) + ) + ) for i, answer in enumerate(answers_by_speech): - # This makes sure the answer has a speaker - if it doesn't, something is wrong - if answer['speaker']: - answers_object['answers'].append({ - 'speaker': answer['speaker'], - 'paragraphs': answer['paragraphs'] - }) + if answer["speaker"]: + answers_object["answers"].append( + {"speaker": answer["speaker"], "paragraphs": answer["paragraphs"]} + ) else: # If this is the first speech with no speaker, it's the answerer. 
- if (i == 0): - logger.debug('First speech with no detected speaker, using "Answered By"') - answers_object['answers'].append({ - 'speaker': answered_by_person, - 'paragraphs': answer['paragraphs'] - }) + if i == 0: + logger.debug( + 'First speech with no detected speaker, using "Answered By"' + ) + answers_object["answers"].append( + { + "speaker": answered_by_person, + "paragraphs": answer["paragraphs"], + } + ) else: - logger.warning('Speech with no detected speaker in question {}!'.format(canonical_url)) - - answer_attachment_div = answer_article.find('div', class_='field--name-field-attachments') + logger.warning( + "Speech with no detected speaker in question {}!".format( + canonical_url + ) + ) + + answer_attachment_div = answer_article.find( + "div", class_="field--name-field-attachments" + ) if answer_attachment_div: - attachments = answer_attachment_div.findAll('a') + attachments = answer_attachment_div.findAll("a") attachments = [str(a) for a in attachments] - answers_object['answers'].append({ - 'speaker': answered_by_person, - 'attachments': attachments, - }) + answers_object["answers"].append( + { + "speaker": answered_by_person, + "attachments": attachments, + } + ) return answers_object def stripPatternsFromName(name): - patterns_to_strip = True while patterns_to_strip: - original_name = name for pattern in NAME_REGEX_TO_STRIP: - name = re.sub(pattern, '', name) + name = re.sub(pattern, "", name) if name == original_name: patterns_to_strip = False @@ -395,9 +421,9 @@ def stripPatternsFromName(name): def getPersonIDFromName(name): - ''' Turn a name into a speaker ID. ''' + """Turn a name into a speaker ID.""" - if name == 'The Mayor': + if name == "The Mayor": name = CURRENT_MAYOR_NAME # If this person's name has a correction, use that instead @@ -408,26 +434,24 @@ def getPersonIDFromName(name): def getSpeakerObjectFromName(name): - ''' Given a name, try to find a speaker ID and return a whole object. ''' + """Given a name, try to find a speaker ID and return a whole object.""" - name = name.replace('\u00a0', ' ') + name = name.replace("\u00a0", " ") name = stripPatternsFromName(name) id = getPersonIDFromName(name) if not id: - if 'Liz Peace' not in name: - logger.warning('Could not match name {} to any assembly member'.format(name)) - id = 'unknown' + if "Liz Peace" not in name: + logger.warning( + "Could not match name {} to any assembly member".format(name) + ) + id = "unknown" - return { - 'id': id, - 'name': name - } + return {"id": id, "name": name} def cleanParagraphText(text): - # Remove non-breaking spaces followed by a space. - text = text.replace('\u00a0 ', ' ') + text = text.replace("\u00a0 ", " ") # Strip trailing whitespace text = text.strip() @@ -436,34 +460,31 @@ def cleanParagraphText(text): def getSpeakerAndTextFromParagraph(paragraph): - ''' For the given paragraph text, try to detect if it is led by a speaker's name. 
''' + """For the given paragraph text, try to detect if it is led by a speaker's name.""" # Strong tags are used to mark speaker names in the source - name_candidate = paragraph.find('strong') + name_candidate = paragraph.find("strong") if name_candidate: - # Sanity check if this matches the expected format of speaker names - a name followed by a colon - if re.match('.*:$', name_candidate.text): - + if re.match(".*:$", name_candidate.text): # extract() removes the element from the beautifulsoup tree and returns it speaker_name = name_candidate.extract() - speaker = getSpeakerObjectFromName(speaker_name.text.replace(':', '').strip()) + speaker = getSpeakerObjectFromName( + speaker_name.text.replace(":", "").strip() + ) else: - speaker = False + speaker = False else: speaker = False - return { - 'speaker': speaker, - 'text': cleanParagraphText(paragraph.text) - } + return {"speaker": speaker, "text": cleanParagraphText(paragraph.text)} def splitTextToSpeeches(text_paragraphs): - ''' Sometimes text has several speeches by different people within it. Try isolate those. ''' + """Sometimes text has several speeches by different people within it. Try isolate those.""" answers_by_speech = [] @@ -471,274 +492,352 @@ def splitTextToSpeeches(text_paragraphs): current_speaker = False for paragraph in text_paragraphs: - if isinstance(paragraph, element.NavigableString): - logger.debug('Ignored NavigableString') + logger.debug("Ignored NavigableString") else: - # Ignore entirely empty paragraphs - if paragraph.text != '': - + if paragraph.text != "": paragraph_with_speaker = getSpeakerAndTextFromParagraph(paragraph) # If this paragraph is a new speaker, wrap up the answer and start a new one - if paragraph_with_speaker['speaker']: + if paragraph_with_speaker["speaker"]: if len(paragraphs_in_speech) > 0: - answers_by_speech.append({ - 'paragraphs': paragraphs_in_speech, - 'speaker': current_speaker - }) - - logger.debug('New speaker! Last speech was {} paragraphs'.format(len(paragraphs_in_speech))) - - paragraphs_in_speech = [paragraph_with_speaker['text']] - current_speaker = paragraph_with_speaker['speaker'] + answers_by_speech.append( + { + "paragraphs": paragraphs_in_speech, + "speaker": current_speaker, + } + ) + + logger.debug( + "New speaker! Last speech was {} paragraphs".format( + len(paragraphs_in_speech) + ) + ) + + paragraphs_in_speech = [paragraph_with_speaker["text"]] + current_speaker = paragraph_with_speaker["speaker"] # If this isn't a new speaker, just append to the current one else: - paragraphs_in_speech.append(paragraph_with_speaker['text']) + paragraphs_in_speech.append(paragraph_with_speaker["text"]) # Finally, wrap up the whole thing if there's anything remaining if len(paragraphs_in_speech) > 0: + logger.debug("Final speech was {} paragraphs".format(len(paragraphs_in_speech))) - logger.debug('Final speech was {} paragraphs'.format(len(paragraphs_in_speech))) + answers_by_speech.append( + {"paragraphs": paragraphs_in_speech, "speaker": current_speaker} + ) - answers_by_speech.append({ - 'paragraphs': paragraphs_in_speech, - 'speaker': current_speaker - }) - - logger.debug('Split {} paragraphs into {} speeches'.format(len(text_paragraphs), len(answers_by_speech))) + logger.debug( + "Split {} paragraphs into {} speeches".format( + len(text_paragraphs), len(answers_by_speech) + ) + ) return answers_by_speech def buildXMLForQuestions(questions): - ''' Given a date, collect answered questions and output the appropriate XML file. 
''' + """Given a date, collect answered questions and output the appropriate XML file.""" - pwxml = etree.Element('publicwhip') + pwxml = etree.Element("publicwhip") for question_id, question in questions.items(): - - question_number = '{}.{}'.format(question['answered_date'].strftime('%Y-%m-%d'), question['canonical_url'].split('/')[-1]) - pw_root_id = '{}{}'.format(PUBLIC_WHIP_QUESTION_ID_PREFIX, question_number) - - pw_heading_id = pw_root_id + '.h' - heading_element = etree.SubElement(pwxml, 'minor-heading', nospeaker='true', id=pw_heading_id) - heading_element.text = question['title'] - - pw_question_id = pw_root_id + '.q0' - question_element = etree.SubElement(pwxml, 'question', - id=pw_question_id, - url=question['canonical_url'], - speakername=question['asked_by']['name'], - person_id=question['asked_by']['id'] - ) - - for paragraph in question['question_text_paragraphs']: - paragraph_element = etree.SubElement(question_element, 'p') + question_number = "{}.{}".format( + question["answered_date"].strftime("%Y-%m-%d"), + question["canonical_url"].split("/")[-1], + ) + pw_root_id = "{}{}".format(PUBLIC_WHIP_QUESTION_ID_PREFIX, question_number) + + pw_heading_id = pw_root_id + ".h" + heading_element = etree.SubElement( + pwxml, "minor-heading", nospeaker="true", id=pw_heading_id + ) + heading_element.text = question["title"] + + pw_question_id = pw_root_id + ".q0" + question_element = etree.SubElement( + pwxml, + "question", + id=pw_question_id, + url=question["canonical_url"], + speakername=question["asked_by"]["name"], + person_id=question["asked_by"]["id"], + ) + + for paragraph in question["question_text_paragraphs"]: + paragraph_element = etree.SubElement(question_element, "p") paragraph_element.text = paragraph - for answer_index, answer in enumerate(question['answers']): - - pw_answer_id = pw_root_id + '.r' + str(answer_index) + for answer_index, answer in enumerate(question["answers"]): + pw_answer_id = pw_root_id + ".r" + str(answer_index) - answer_element = etree.SubElement(pwxml, 'reply', - id=pw_answer_id, - speakername=answer['speaker']['name'], - person_id=answer['speaker']['id'] - ) + answer_element = etree.SubElement( + pwxml, + "reply", + id=pw_answer_id, + speakername=answer["speaker"]["name"], + person_id=answer["speaker"]["id"], + ) - for paragraph in answer.get('paragraphs', []): - paragraph_element = etree.SubElement(answer_element, 'p') + for paragraph in answer.get("paragraphs", []): + paragraph_element = etree.SubElement(answer_element, "p") paragraph_element.text = paragraph - for attachment in answer.get('attachments', []): - paragraph_element = etree.SubElement(answer_element, 'p') + for attachment in answer.get("attachments", []): + paragraph_element = etree.SubElement(answer_element, "p") paragraph_element.append(etree.fromstring(attachment)) return pwxml def writeXMLToFile(lxml, file): - ''' Write an lxml element out to file. ''' + """Write an lxml element out to file.""" # Make a new document tree xmldoc = etree.ElementTree(lxml) # Save to XML file - with open(file, 'w') as outFile: - xmldoc.write(outFile, pretty_print=True, encoding='utf-8') - logger.debug('Written XML to {}'.format(file)) + with open(file, "w") as outFile: + xmldoc.write(outFile, pretty_print=True, encoding="utf-8") + logger.debug("Written XML to {}".format(file)) def buildDateStatusObjectFromScrape(meeting_scrape_data): - ''' Format a date's status for storing in the state file. 
''' + """Format a date's status for storing in the state file.""" status_object = { - 'http_status': meeting_scrape_data['http_status'], - 'to_scrape': meeting_scrape_data['to_scrape'] if 'to_scrape' in meeting_scrape_data else True, - 'updated': datetime.datetime.today() + "http_status": meeting_scrape_data["http_status"], + "to_scrape": meeting_scrape_data["to_scrape"] + if "to_scrape" in meeting_scrape_data + else True, + "updated": datetime.datetime.today(), } - if 'sessions' in meeting_scrape_data: - status_object['sessions_count'] = len(meeting_scrape_data['sessions']) + if "sessions" in meeting_scrape_data: + status_object["sessions_count"] = len(meeting_scrape_data["sessions"]) - if 'questions' in meeting_scrape_data: - status_object['questions_count'] = len(meeting_scrape_data['questions']) + if "questions" in meeting_scrape_data: + status_object["questions_count"] = len(meeting_scrape_data["questions"]) return status_object def loadMembershipsFromFile(members_file): - ''' Parse the provided file and extract data on Assembly members. ''' + """Parse the provided file and extract data on Assembly members.""" # We don't need to open this file, since Click deals with that members_raw_data = json.load(members_file) - logger.debug('Loaded {} people from {}'.format(len(members_raw_data['persons']), members_file.name)) + logger.debug( + "Loaded {} people from {}".format( + len(members_raw_data["persons"]), members_file.name + ) + ) people_by_id = {} post_org_by_id = {} # This unpacks all the people in the JSON so we can pull a person's name back from their ID - for person in members_raw_data['persons']: - people_by_id[person['id']] = person - for post in members_raw_data['posts']: - post_org_by_id[post['id']] = post['organization_id'] + for person in members_raw_data["persons"]: + people_by_id[person["id"]] = person + for post in members_raw_data["posts"]: + post_org_by_id[post["id"]] = post["organization_id"] # This loops through each membership, checks to see if it's for the Assembly, if so adds it to the map person_ids_by_name = {} - for membership in members_raw_data['memberships']: - if 'post_id' in membership and post_org_by_id[membership['post_id']] == 'london-assembly': - name = getNameFromPerson(people_by_id[membership['person_id']]) + for membership in members_raw_data["memberships"]: + if ( + "post_id" in membership + and post_org_by_id[membership["post_id"]] == "london-assembly" + ): + name = getNameFromPerson(people_by_id[membership["person_id"]]) if name not in person_ids_by_name: - person_ids_by_name[name] = membership['person_id'] - logger.debug('Added ID map for for {}'.format(name)) + person_ids_by_name[name] = membership["person_id"] + logger.debug("Added ID map for for {}".format(name)) else: - if person_ids_by_name[name] != membership['person_id']: - raise Exception('Multiple people with name {}'.format(name)) + if person_ids_by_name[name] != membership["person_id"]: + raise Exception("Multiple people with name {}".format(name)) - logger.debug('Added {} names with Assembly memberships'.format(len(person_ids_by_name))) + logger.debug( + "Added {} names with Assembly memberships".format(len(person_ids_by_name)) + ) return person_ids_by_name def getNameFromPerson(person): + for name in person.get("other_names", []): + if name["note"] == "Main": + return name["given_name"] + " " + name["family_name"] - for name in person.get('other_names', []): - if name['note'] == 'Main': - return name['given_name'] + ' ' + name['family_name'] - - raise Exception('Unable to find main 
name for person {}'.format(person['id'])) + raise Exception("Unable to find main name for person {}".format(person["id"])) @click.group() -@click_log.simple_verbosity_option(logger, default='warning') -@click.option('-o', '--out', required=True, type=click.Path(exists=True, file_okay=False, writable=True), help='The directory to place output and state files.') +@click_log.simple_verbosity_option(logger, default="warning") +@click.option( + "-o", + "--out", + required=True, + type=click.Path(exists=True, file_okay=False, writable=True), + help="The directory to place output and state files.", +) @click.pass_context def cli(context, out): context.ensure_object(dict) - context.obj['OUTPUT_FOLDER'] = out + context.obj["OUTPUT_FOLDER"] = out # Get the current state file, parse it and assign to the context - context.obj['state'] = getScraperState(context.obj['OUTPUT_FOLDER']) + context.obj["state"] = getScraperState(context.obj["OUTPUT_FOLDER"]) @cli.command() -@click.option('-s', '--start', type=CLI_DATETIME_FORMAT, help='The first date of the range to be scrape.') -@click.option('-e', '--end', type=CLI_DATETIME_FORMAT, help='The last date of the range to be scraped.') -@click.option('--force-scrape-dates', is_flag=True, help='Force all dates in the range to be re-scraped regardless of status') -@click.option('--force-refresh-questions', is_flag=True, help='Force all detected questions to have their state refreshed') +@click.option( + "-s", + "--start", + type=CLI_DATETIME_FORMAT, + help="The first date of the range to be scrape.", +) +@click.option( + "-e", + "--end", + type=CLI_DATETIME_FORMAT, + help="The last date of the range to be scraped.", +) +@click.option( + "--force-scrape-dates", + is_flag=True, + help="Force all dates in the range to be re-scraped regardless of status", +) +@click.option( + "--force-refresh-questions", + is_flag=True, + help="Force all detected questions to have their state refreshed", +) @click.pass_context def meetings(context, start, end, force_scrape_dates, force_refresh_questions): - ''' Get a list of questions from the London Assembly website asked between the dates given. ''' + """Get a list of questions from the London Assembly website asked between the dates given.""" - logger.info('Scraping London Assembly') + logger.info("Scraping London Assembly") if start: start_date = start.date() - logger.debug('End date has been explicitly set to {} by CLI'.format(start_date)) + logger.debug("End date has been explicitly set to {} by CLI".format(start_date)) else: - start_date = datetime.datetime.strptime(DEFAULT_START_DATE, '%Y-%m-%d').date() - logger.debug('Start date has been automatically set to {} by config'.format(start_date)) + start_date = datetime.datetime.strptime(DEFAULT_START_DATE, "%Y-%m-%d").date() + logger.debug( + "Start date has been automatically set to {} by config".format(start_date) + ) if end: end_date = end.date() - logger.debug('End date has been explicitly set to {} by CLI'.format(end_date)) + logger.debug("End date has been explicitly set to {} by CLI".format(end_date)) else: # Yesterday end_date = (datetime.datetime.today() - datetime.timedelta(days=1)).date() - logger.debug('End date has been automatically set to {} (yesterday)'.format(end_date)) + logger.debug( + "End date has been automatically set to {} (yesterday)".format(end_date) + ) if end_date < start_date: - logger.error('End date is before the start date. Aborting.') + logger.error("End date is before the start date. 
Aborting.") return dates_in_range = getDatesInRange(start_date, end_date) - logger.info('Targetting {} dates between {} and {}.'.format(len(dates_in_range), start_date, end_date)) + logger.info( + "Targetting {} dates between {} and {}.".format( + len(dates_in_range), start_date, end_date + ) + ) questions_in_range = [] with click.progressbar(dates_in_range) as bar: for date in bar: - # Check to see if we should actually scrape this date - if force_scrape_dates \ - or str(date) not in context.obj['state']['dates'] \ - or (str(date) in context.obj['state']['dates'] and context.obj['state']['dates'][str(date)]['to_scrape']): - - logger.info('Scraping date {}'.format(date)) - - meeting_scrape_data = scrapeAssemblyMeetingOnDate(date) - - if 'questions' in meeting_scrape_data: - logger.info('{} has {} questions'.format(date, len(meeting_scrape_data['questions']))) - - questions_in_range += meeting_scrape_data['questions'] - - context.obj['state']['dates'][str(date)] = buildDateStatusObjectFromScrape(meeting_scrape_data) + if ( + force_scrape_dates + or str(date) not in context.obj["state"]["dates"] + or ( + str(date) in context.obj["state"]["dates"] + and context.obj["state"]["dates"][str(date)]["to_scrape"] + ) + ): + logger.info("Scraping date {}".format(date)) + + meeting_scrape_data = scrapeAssemblyMeetingOnDate(date) + + if "questions" in meeting_scrape_data: + logger.info( + "{} has {} questions".format( + date, len(meeting_scrape_data["questions"]) + ) + ) + + questions_in_range += meeting_scrape_data["questions"] + + context.obj["state"]["dates"][str(date)] = ( + buildDateStatusObjectFromScrape(meeting_scrape_data) + ) else: + logger.debug( + "Skipping date {} (already scraped successfully)".format(date) + ) - logger.debug('Skipping date {} (already scraped successfully)'.format(date)) - - logger.info('{} questions found in this scrape'.format(len(questions_in_range))) + logger.info("{} questions found in this scrape".format(len(questions_in_range))) for question in questions_in_range: # Only do this if the question doesn't already exist, or we're forcing a refresh - if force_refresh_questions or question not in context.obj['state']['questions']: - context.obj['state']['questions'][question] = { - 'to_scrape': True, - 'scrape_requested_on': datetime.datetime.today() + if force_refresh_questions or question not in context.obj["state"]["questions"]: + context.obj["state"]["questions"][question] = { + "to_scrape": True, + "scrape_requested_on": datetime.datetime.today(), } @cli.command() -@click.option('-l', '--limit', type=int, help='The maximum number of questions to scrape') -@click.option('-m', '--members', required=True, type=click.File(), help='The members.json file to match names against.') -@click.option('--dry-run', is_flag=True, help='Should questions be marked as not needing scraping in future?') +@click.option( + "-l", "--limit", type=int, help="The maximum number of questions to scrape" +) +@click.option( + "-m", + "--members", + required=True, + type=click.File(), + help="The members.json file to match names against.", +) +@click.option( + "--dry-run", + is_flag=True, + help="Should questions be marked as not needing scraping in future?", +) @click.pass_context def questions(context, limit, members, dry_run): - ''' Update all questions which are still pending a scrape. ''' + """Update all questions which are still pending a scrape.""" # Try load in the Members data first - if that fails there's no point continuing. 
# ASSEMBLY_MEMBERS_BY_NAME is global to avoid having to pass it down every function until names are turned to IDs global ASSEMBLY_MEMBERS_BY_NAME ASSEMBLY_MEMBERS_BY_NAME = loadMembershipsFromFile(members) - logger.debug('{} questions are known to exist'.format(len(context.obj['state']['questions']))) + logger.debug( + "{} questions are known to exist".format(len(context.obj["state"]["questions"])) + ) questions_to_scrape = [] - for question_id, question_state in context.obj['state']['questions'].items(): - if question_state['to_scrape']: + for question_id, question_state in context.obj["state"]["questions"].items(): + if question_state["to_scrape"]: questions_to_scrape.append(question_id) # If a limit is provided, set it. Otherwise, scrape the lot. @@ -746,46 +845,51 @@ def questions(context, limit, members, dry_run): if limit: questions_to_scrape = questions_to_scrape[:limit] - logger.info('Scraping {} questions'.format(len(questions_to_scrape))) + logger.info("Scraping {} questions".format(len(questions_to_scrape))) scraped_questions = {} with click.progressbar(questions_to_scrape) as bar: for question_id in bar: - - scraped_questions[question_id] = scrapeQuestionWithId(question_id,context) - context.obj['state']['questions'][question_id]['scraped_at'] = datetime.datetime.today() + scraped_questions[question_id] = scrapeQuestionWithId(question_id, context) + context.obj["state"]["questions"][question_id]["scraped_at"] = ( + datetime.datetime.today() + ) answered_questions = {} for question_id, question_object in scraped_questions.items(): - # question will be None if we failed to scrape it, e.g page error - if question_object is not None and question_object['answered'] == True: - answered_date = question_object['answered_date'] - answered_questions.setdefault(answered_date, {})[question_id] = question_object + if question_object is not None and question_object["answered"] == True: + answered_date = question_object["answered_date"] + answered_questions.setdefault(answered_date, {})[question_id] = ( + question_object + ) if not dry_run: # Setting this question's scrape state to False means it won't be processed again - context.obj['state']['questions'][question_id]['to_scrape'] = False + context.obj["state"]["questions"][question_id]["to_scrape"] = False - logger.info('{} questions have had answers found in this scrape'.format(len(answered_questions))) + logger.info( + "{} questions have had answers found in this scrape".format( + len(answered_questions) + ) + ) # If there are new answers, write out our file. 
if len(answered_questions) > 0: for date, qns in answered_questions.items(): - - i = 0; - + i = 0 file_needs_writing = True while file_needs_writing: - - date_string = date.strftime('%Y-%m-%d') + date_string = date.strftime("%Y-%m-%d") letter_suffix = string.ascii_lowercase[i] - output_filename = XML_FILE_PREFIX + date_string + letter_suffix + '.xml' - output_file = os.path.join(context.obj['OUTPUT_FOLDER'], output_filename) + output_filename = XML_FILE_PREFIX + date_string + letter_suffix + ".xml" + output_file = os.path.join( + context.obj["OUTPUT_FOLDER"], output_filename + ) if os.path.exists(output_file): i = i + 1 @@ -795,67 +899,80 @@ def questions(context, limit, members, dry_run): file_needs_writing = False -@cli.command(name='set_date_scrape') -@click.option('--date', required=True, type=CLI_DATETIME_FORMAT, help='The date to alter the scrape status of.') -@click.option('--scrape/--no-scrape', required=True, help='Should the date be marked as needing scraping, or not?') +@cli.command(name="set_date_scrape") +@click.option( + "--date", + required=True, + type=CLI_DATETIME_FORMAT, + help="The date to alter the scrape status of.", +) +@click.option( + "--scrape/--no-scrape", + required=True, + help="Should the date be marked as needing scraping, or not?", +) @click.pass_context def set_date_scrape(context, date, scrape): - ''' Explicitly set if a date should be scraped or not at the next run. + """Explicitly set if a date should be scraped or not at the next run. - Used to either manually request a re-scraping of a date, or to suppress future scraping of a date. ''' + Used to either manually request a re-scraping of a date, or to suppress future scraping of a date.""" date = date.date() - click.echo('Setting scrape status of {} to {}'.format(date, scrape)) + click.echo("Setting scrape status of {} to {}".format(date, scrape)) - if date in context.obj['state']['dates']: - context.obj['state']['dates'][str(date)]['to_scrape'] = scrape + if date in context.obj["state"]["dates"]: + context.obj["state"]["dates"][str(date)]["to_scrape"] = scrape else: - context.obj['state']['dates'][str(date)] = { - 'to_scrape': scrape - } + context.obj["state"]["dates"][str(date)] = {"to_scrape": scrape} -@cli.command(name='set_question_scrape') -@click.option('--id', required=True, help='The question to alter the scrape status.') -@click.option('--scrape/--no-scrape', required=True, help='Should the question be marked as needing scraping, or not?') +@cli.command(name="set_question_scrape") +@click.option("--id", required=True, help="The question to alter the scrape status.") +@click.option( + "--scrape/--no-scrape", + required=True, + help="Should the question be marked as needing scraping, or not?", +) @click.pass_context def set_question_scrape(context, id, scrape): - ''' Explicitly set if a question should be scraped or not at the next run. + """Explicitly set if a question should be scraped or not at the next run. - Used to either manually request a re-scraping of a question, or to suppress future scraping of a question. 
''' + Used to either manually request a re-scraping of a question, or to suppress future scraping of a question.""" - click.echo('Setting scrape status of {} to {}'.format(id, scrape)) + click.echo("Setting scrape status of {} to {}".format(id, scrape)) - if id in context.obj['state']['questions']: - context.obj['state']['questions'][id]['to_scrape'] = scrape + if id in context.obj["state"]["questions"]: + context.obj["state"]["questions"][id]["to_scrape"] = scrape else: - context.obj['state']['questions'][id] = { - 'to_scrape': scrape - } + context.obj["state"]["questions"][id] = {"to_scrape": scrape} -@cli.command(name='reset_state') +@cli.command(name="reset_state") @click.pass_context def reset_state(context): - ''' Reset the scraper's state file, wiping all knowledge of dates and questions. ''' + """Reset the scraper's state file, wiping all knowledge of dates and questions.""" - click.secho('Resetting the state file will wipe all information about the states of dates and questions.', bg='red', fg='white') + click.secho( + "Resetting the state file will wipe all information about the states of dates and questions.", + bg="red", + fg="white", + ) - if click.confirm('Are you really sure you want to do this?', abort=True): - logger.info('Resetting scraper state file') + if click.confirm("Are you really sure you want to do this?", abort=True): + logger.info("Resetting scraper state file") - context.obj['state'] = EMPTY_STATE_OBJECT + context.obj["state"] = EMPTY_STATE_OBJECT - click.echo('All done. Have a nice day.') + click.echo("All done. Have a nice day.") @cli.resultcallback() @click.pass_context def process_result(context, result, **kwargs): - ''' Called after anything in the CLI command group, to write the state back to the file. ''' - writeScraperState(context.obj['state'], context.obj['OUTPUT_FOLDER']) + """Called after anything in the CLI command group, to write the state back to the file.""" + writeScraperState(context.obj["state"], context.obj["OUTPUT_FOLDER"]) -if __name__ == '__main__': +if __name__ == "__main__": cli(obj={}) diff --git a/members/parl-old-check-party.py b/members/parl-old-check-party.py index badcef9d..b01eede7 100644 --- a/members/parl-old-check-party.py +++ b/members/parl-old-check-party.py @@ -1,117 +1,208 @@ #!/usr/bin/python -# +# # Old Work-In-Progress for something using old Parliament API, comparing # parties. Would probably want reworking using new API. The purpose would be to # check for changes against our data, and alert someone that something needs # fixing (or longer term, fix it automatically). 
-import re +import sys import urllib.request + import lxml.objectify -import sys sys.path.append("../pyscraper") from lords.resolvenames import lordsList -TYPES = ( - '2 Hered Office Holders', 'Bishops and Archbishops', 'Deputy Hereditary', 'Elected Hereditary', 'Hereditary', - 'Hereditary of 1st creation', 'Hereds given LPs', 'Law Lord', 'Life peer', +TYPES = ( + "2 Hered Office Holders", + "Bishops and Archbishops", + "Deputy Hereditary", + "Elected Hereditary", + "Hereditary", + "Hereditary of 1st creation", + "Hereds given LPs", + "Law Lord", + "Life peer", +) +RANKS = ( + "Archbishop", + "Baroness", + "Bishop", + "Countess", + "Duke", + "Earl", + "Lady", + "Lord", + "Marquess", + "Prince", + "Viscount", ) -RANKS = ( 'Archbishop', 'Baroness', 'Bishop', 'Countess', 'Duke', 'Earl', 'Lady', 'Lord', 'Marquess', 'Prince', 'Viscount' ) -GENDERS = ( 'Female', 'Male' ) +GENDERS = ("Female", "Male") PARTIES = ( - '', 'Alliance', 'Bishops', 'Conservative', 'Conservative Independent', 'Crossbench', 'Democratic Unionist', - 'Independent Labour', 'Labour', 'Labour Independent', 'Liberal Democrat', 'Non-affiliated (current Member)', - 'Other', 'Plaid Cymru', 'UK Independence Party', 'Ulster Unionist Party', + "", + "Alliance", + "Bishops", + "Conservative", + "Conservative Independent", + "Crossbench", + "Democratic Unionist", + "Independent Labour", + "Labour", + "Labour Independent", + "Liberal Democrat", + "Non-affiliated (current Member)", + "Other", + "Plaid Cymru", + "UK Independence Party", + "Ulster Unionist Party", ) -STATUS = ('Active', 'Retired', 'Deceased', 'Suspended', 'Inactive', 'Disqualified', 'Resigned', 'LeaveOfAbsence') +STATUS = ( + "Active", + "Retired", + "Deceased", + "Suspended", + "Inactive", + "Disqualified", + "Resigned", + "LeaveOfAbsence", +) + class Lord: left_date = None def __init__(self, lord): - self.ids = { 'id': lord.get('id'), 'pims': lord.get('pimsId'), 'dods': lord.get('dodsId') } - self.type = TYPES.index(lord.type) - self.rank = RANKS.index(lord.rank) - self.firstName = str(getattr(lord, 'firstName', '')) - self.lastName = str(lord.lastName) - self.shortTitle = str(lord.shortTitle).replace(' ', ' ') # Used in division listings - self.longTitle = str(lord.longTitle).replace('Rdt Hon. ', '') # Used in debate speech - self.party = PARTIES.index(lord['{urn:parliament/metadata/core/2010/10/01/party}party'].partyName) - self.website = str(lord.get('website', '')) - self.gender = GENDERS.index(lord['{urn:parliament/metadata/core/2010/10/01/gender}gender']) - self.lastOath = str(lord.lastOathDate)[:10] - - honours = getattr(lord, '{urn:parliament/metadata/core/members/2010/10/01/honour}honours', None) + self.ids = { + "id": lord.get("id"), + "pims": lord.get("pimsId"), + "dods": lord.get("dodsId"), + } + self.type = TYPES.index(lord.type) + self.rank = RANKS.index(lord.rank) + self.firstName = str(getattr(lord, "firstName", "")) + self.lastName = str(lord.lastName) + self.shortTitle = str(lord.shortTitle).replace( + " ", " " + ) # Used in division listings + self.longTitle = str(lord.longTitle).replace( + "Rdt Hon. 
", "" + ) # Used in debate speech + self.party = PARTIES.index( + lord["{urn:parliament/metadata/core/2010/10/01/party}party"].partyName + ) + self.website = str(lord.get("website", "")) + self.gender = GENDERS.index( + lord["{urn:parliament/metadata/core/2010/10/01/gender}gender"] + ) + self.lastOath = str(lord.lastOathDate)[:10] + + honours = getattr( + lord, + "{urn:parliament/metadata/core/members/2010/10/01/honour}honours", + None, + ) if honours is not None: - self.honours = [ ( str(h.name), str(h.startDate) ) for h in honours['{urn:parliament/metadata/core/2010/10/01/honour}honour'] ] - - status = lord['{urn:parliament/metadata/core/2010/10/01/status}status'] - self.status = STATUS[STATUS.index(status['name'])] - self.statusInfo = status['statusInformation'] - - if self.status == 'Retired': - self.left_date = str(self.statusInfo['dateOfRetirement'])[:10] - elif self.status == 'Deceased': - self.left_date = str(self.statusInfo['dateOfDeath'])[:10] - elif self.status == 'Suspended': - start_date = str(self.statusInfo['startDate'])[:10] - end_date = str(self.statusInfo['endDate'])[:10] - reason = self.statusInfo['description'] + self.honours = [ + (str(h.name), str(h.startDate)) + for h in honours[ + "{urn:parliament/metadata/core/2010/10/01/honour}honour" + ] + ] + + status = lord["{urn:parliament/metadata/core/2010/10/01/status}status"] + self.status = STATUS[STATUS.index(status["name"])] + self.statusInfo = status["statusInformation"] + + if self.status == "Retired": + self.left_date = str(self.statusInfo["dateOfRetirement"])[:10] + elif self.status == "Deceased": + self.left_date = str(self.statusInfo["dateOfDeath"])[:10] + elif self.status == "Suspended": + start_date = str(self.statusInfo["startDate"])[:10] + end_date = str(self.statusInfo["endDate"])[:10] + reason = self.statusInfo["description"] self.status = (self.status, start_date, end_date, reason) - elif self.status == 'Inactive': - self.left_date = str(self.statusInfo['membershipEndDate'])[:10] - elif self.status == 'Disqualified': - start_date = str(self.statusInfo['startDate'])[:10] - end_date = str(self.statusInfo['endDate'])[:10] - reason = self.statusInfo['reason'] + elif self.status == "Inactive": + self.left_date = str(self.statusInfo["membershipEndDate"])[:10] + elif self.status == "Disqualified": + start_date = str(self.statusInfo["startDate"])[:10] + end_date = str(self.statusInfo["endDate"])[:10] + reason = self.statusInfo["reason"] self.status = (self.status, start_date, end_date, reason) - elif self.status == 'Resigned': - self.left_date = str(self.statusInfo['dateOfResignation'])[:10] - elif self.status == 'LeaveOfAbsence': - assert self.party in (PARTIES.index('Non-affiliated (current Member)'), PARTIES.index('Other')) - #self.party = PARTIES.index(self.statusInfo['party']['partyName']) - elif self.status == 'Active': + elif self.status == "Resigned": + self.left_date = str(self.statusInfo["dateOfResignation"])[:10] + elif self.status == "LeaveOfAbsence": + assert self.party in ( + PARTIES.index("Non-affiliated (current Member)"), + PARTIES.index("Other"), + ) + # self.party = PARTIES.index(self.statusInfo['party']['partyName']) + elif self.status == "Active": pass # Corrections - if self.longTitle == 'The Lord McAlpine of West Green': - self.left_date = '2010-05-21' # From House of Lords journal - if self.longTitle == 'The Most Hon. the Marquess of Salisbury DL': - self.status = 'Retired' # The 6th Marquess left, as I understand it - self.left_date = '1999-11-11' - if self.longTitle == 'The Rt Hon. 
the Viscount Younger of Leckie KT KCVO TD DL': - self.type = TYPES.index('Hereds given LPs') # Not a Hereditary - if self.longTitle == 'The Earl of Carnarvon KCVO KBE DL': - self.type = TYPES.index('Elected Hereditary') # One of the 92 + if self.longTitle == "The Lord McAlpine of West Green": + self.left_date = "2010-05-21" # From House of Lords journal + if self.longTitle == "The Most Hon. the Marquess of Salisbury DL": + self.status = "Retired" # The 6th Marquess left, as I understand it + self.left_date = "1999-11-11" + if self.longTitle == "The Rt Hon. the Viscount Younger of Leckie KT KCVO TD DL": + self.type = TYPES.index("Hereds given LPs") # Not a Hereditary + if self.longTitle == "The Earl of Carnarvon KCVO KBE DL": + self.type = TYPES.index("Elected Hereditary") # One of the 92 def __str__(self): - return '%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status ) + return "%s (%s) - %s" % (self.longTitle, PARTIES[self.party], self.status) + # Fetch the current live information -lords = urllib.request.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read() -lords = [ Lord(lord) for lord in lxml.objectify.fromstring(lords).peer ] +lords = urllib.request.urlopen( + "http://data.parliament.uk/resources/members/api/lords/all/" +).read() +lords = [Lord(lord) for lord in lxml.objectify.fromstring(lords).peer] for lord in lords: # Ignore hereditaries retired by the House of Lords Act 1999, or # others who retired or dided before our records begin - if lord.status in ('Deceased', 'Retired') and lord.left_date <= '1999-11-11': continue + if lord.status in ("Deceased", "Retired") and lord.left_date <= "1999-11-11": + continue # We don't show ones that haven't been introduced yet (and couple of bugs, looks like) - if not lord.lastOath: continue - - date = lord.left_date or '2011-12-04' - match = lordsList.MatchRevName(lord.shortTitle, date, '') - - #if '%s %s' % (lord.title, lord.lastName) in self. - if PARTIES[lord.party] == 'Conservative' and lordsList.lords[match]['party'] == 'Con': continue - if PARTIES[lord.party] == 'Labour' and lordsList.lords[match]['party'] == 'Lab': continue - if PARTIES[lord.party] == 'Liberal Democrat' and lordsList.lords[match]['party'] == 'LDem': continue - if PARTIES[lord.party] == 'Crossbench' and lordsList.lords[match]['party'] == 'XB': continue - if PARTIES[lord.party] == 'Bishops' and lordsList.lords[match]['party'] == 'Bp': continue - if PARTIES[lord.party] == 'Ulster Unionist Party' and lordsList.lords[match]['party'] == 'UUP': continue - if PARTIES[lord.party] == 'UK Independence Party' and lordsList.lords[match]['party'] == 'UKIP': continue - if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue - if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue - print(PARTIES[lord.party], lordsList.lords[match]['party']) - + if not lord.lastOath: + continue + + date = lord.left_date or "2011-12-04" + match = lordsList.MatchRevName(lord.shortTitle, date, "") + + # if '%s %s' % (lord.title, lord.lastName) in self. 
+ if ( + PARTIES[lord.party] == "Conservative" + and lordsList.lords[match]["party"] == "Con" + ): + continue + if PARTIES[lord.party] == "Labour" and lordsList.lords[match]["party"] == "Lab": + continue + if ( + PARTIES[lord.party] == "Liberal Democrat" + and lordsList.lords[match]["party"] == "LDem" + ): + continue + if PARTIES[lord.party] == "Crossbench" and lordsList.lords[match]["party"] == "XB": + continue + if PARTIES[lord.party] == "Bishops" and lordsList.lords[match]["party"] == "Bp": + continue + if ( + PARTIES[lord.party] == "Ulster Unionist Party" + and lordsList.lords[match]["party"] == "UUP" + ): + continue + if ( + PARTIES[lord.party] == "UK Independence Party" + and lordsList.lords[match]["party"] == "UKIP" + ): + continue + if PARTIES[lord.party] == "Plaid Cymru" and lordsList.lords[match]["party"] == "PC": + continue + if PARTIES[lord.party] == "Plaid Cymru" and lordsList.lords[match]["party"] == "PC": + continue + print(PARTIES[lord.party], lordsList.lords[match]["party"]) diff --git a/members/wikipedia-commons.py b/members/wikipedia-commons.py index 511684e6..b6831585 100755 --- a/members/wikipedia-commons.py +++ b/members/wikipedia-commons.py @@ -7,10 +7,10 @@ # certain conditions. However, it comes with ABSOLUTELY NO WARRANTY. # For details see the file LICENSE.html in the top level of the source. -import datetime +import re import sys import urllib.parse -import re + # import sets sys.path.append("../pyscraper") @@ -18,59 +18,61 @@ from resolvemembernames import memberList # Get region pages -wiki_index_url = "http://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2005" +wiki_index_url = ( + "http://en.wikipedia.org/wiki/MPs_elected_in_the_UK_general_election,_2005" +) date_parl = { - 1997: '1999-01-01', - 2001: '2003-01-01', - 2005: '2007-08-01', - 2010: '2014-01-01', - 2015: '2016-01-01', + 1997: "1999-01-01", + 2001: "2003-01-01", + 2005: "2007-08-01", + 2010: "2014-01-01", + 2015: "2016-01-01", } -wikimembers = {} +wikimembers = {} -# Grab page +# Grab page for year in (1997, 2001, 2005, 2010, 2015): - ur = open('../rawdata/Members_of_the_House_of_Commons_%d' % year) + ur = open("../rawdata/Members_of_the_House_of_Commons_%d" % year) content = ur.read() ur.close() -# -#West Ham -#Lyn Brown -#Labour - matcher = '\s+]*?title="[^"]+">([^<]+)(?:
\s+.*?)?\s*\s+(?:]*>\s*\s*]*>]*>[^<]*\s*\s*]*>\s*\s*)?(?:(?:[^<]*|))?(?:Dr |Sir |The Rev\. )?]*?title="[^"]+"[^>]*>([^<]+)(?:(?:){2,3})?(?: \(.*?\))?\s*|by-election,[^"]+">([^<]+) [^ ]{1,3} ([^<]+)'; + # + # West Ham + # Lyn Brown + # Labour + matcher = '\s+]*?title="[^"]+">([^<]+)(?:
\s+.*?)?\s*\s+(?:]*>\s*\s*]*>]*>[^<]*\s*\s*]*>\s*\s*)?(?:(?:[^<]*|))?(?:Dr |Sir |The Rev\. )?]*?title="[^"]+"[^>]*>([^<]+)(?:(?:){2,3})?(?: \(.*?\))?\s*|by-election,[^"]+">([^<]+) [^ ]{1,3} ([^<]+)' matches = re.findall(matcher, content) - for (cons, url, name, cons2, url2, name2) in matches: + for cons, url, name, cons2, url2, name2 in matches: id = None if cons2: cons = cons2 name = name2 url = url2 - cons = cons.replace('&', '&') + cons = cons.replace("&", "&") try: - (id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year]) + (id, canonname, canoncons) = memberList.matchfullnamecons( + name, cons, date_parl[year] + ) except Exception as e: print(e, file=sys.stderr) if not id: continue wikimembers[id] = url -print(''' -''') +print(""" +""") k = sorted(wikimembers) for id in k: url = urllib.parse.urljoin(wiki_index_url, wikimembers[id]) print('' % (id, url)) -print('') +print("") -#wikimembers = sets.Set(wikimembers.keys()) -#print "len: ", len(wikimembers) +# wikimembers = sets.Set(wikimembers.keys()) +# print "len: ", len(wikimembers) # Check we have everybody -- ha! not likely yet -#allmembers = sets.Set(memberList.currentmpslist()) -#symdiff = allmembers.symmetric_difference(wikimembers) -#if len(symdiff) > 0: +# allmembers = sets.Set(memberList.currentmpslist()) +# symdiff = allmembers.symmetric_difference(wikimembers) +# if len(symdiff) > 0: # print >>sys.stderr, "Failed to get all MPs, these ones in symmetric difference" # print >>sys.stderr, symdiff - - diff --git a/members/wikipedia-lords.py b/members/wikipedia-lords.py index 34ac829b..852f3349 100755 --- a/members/wikipedia-lords.py +++ b/members/wikipedia-lords.py @@ -8,9 +8,9 @@ # For details see the file LICENSE.html in the top level of the source. import datetime +import re import sys import urllib.parse -import re sys.path.append("../pyscraper") from lords.resolvenames import lordsList @@ -20,39 +20,39 @@ date_today = datetime.date.today().isoformat() wikimembers = {} -# Grab page -ur = open('../rawdata/Members_of_the_House_of_Lords') +# Grab page +ur = open("../rawdata/Members_of_the_House_of_Lords") content = ur.read() ur.close() -#The Lord Ampthill -matcher = '\s+]*?title="([^"]+)"[^>]*>([^<]+)\s*'; +# The Lord Ampthill +matcher = ( + '\s+]*?title="([^"]+)"[^>]*>([^<]+)\s*' +) matches = re.findall(matcher, content) -for (url, title, name) in matches: +for url, title, name in matches: id = None try: id = lordsList.GetLordIDfname(name, None, date_today) - except Exception as e: + except Exception: continue if not id: continue wikimembers[id] = url -print(''' -''') +print(""" +""") for id, url in sorted(wikimembers.items()): url = urllib.parse.urljoin(wiki_index_url, url) print('' % (id, url)) -print('') +print("") -#print "len: ", len(wikimembers) +# print "len: ", len(wikimembers) # Check we have everybody -- ha! not likely yet -#allmembers = set(memberList.currentmpslist()) -#symdiff = allmembers.symmetric_difference(wikimembers) -#if len(symdiff) > 0: +# allmembers = set(memberList.currentmpslist()) +# symdiff = allmembers.symmetric_difference(wikimembers) +# if len(symdiff) > 0: # print >>sys.stderr, "Failed to get all MPs, these ones in symmetric difference" # print >>sys.stderr, symdiff - - diff --git a/members/wikipedia-standingdown.py b/members/wikipedia-standingdown.py index bbf0c5e6..19c92524 100755 --- a/members/wikipedia-standingdown.py +++ b/members/wikipedia-standingdown.py @@ -7,24 +7,25 @@ # certain conditions. However, it comes with ABSOLUTELY NO WARRANTY. 
# For details see the file LICENSE.html in the top level of the source. -import sys import re +import sys sys.path.append("../pyscraper") from resolvemembernames import memberList -today = '2024-05-24' +today = "2024-05-24" -page = open('../rawdata/Members_of_the_2024_standing_down').read() -page = re.sub('(?s)^.*?Members of Parliament not standing for re-election', '', page) -page = re.sub('(?s).*', '', page) +page = open("../rawdata/Members_of_the_2024_standing_down").read() +page = re.sub( + "(?s)^.*?Members of Parliament not standing for re-election", "", page +) +page = re.sub("(?s).*", "", page) -print(''' -''') +print(""" +""") m = re.findall(r'\s*.*?]*>([^<]*)', page) for row in m: url, name = row - pid, canonname, canoncons = memberList.matchfullnamecons(name, None, today) + pid, canonname, canoncons = memberList.matchfullnamecons(name, None, today) print((' ' % (pid, name))) -print('') - +print("") diff --git a/pyscraper/base_resolver.py b/pyscraper/base_resolver.py index ea7939b4..62beacab 100644 --- a/pyscraper/base_resolver.py +++ b/pyscraper/base_resolver.py @@ -2,177 +2,211 @@ import os import re -members_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'members')) +members_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "members")) + class ResolverBase(object): def __init__(self): self.reloadJSON() def reloadJSON(self): - self.members = {} # ID --> membership - self.persons = {} # ID --> person - self.fullnames = {} # "Firstname Lastname" --> memberships - self.lastnames = {} # Surname --> memberships - - self.constoidmap = {} # constituency name --> cons attributes (with date and ID) - self.considtonamemap = {} # cons ID --> name - self.considtomembermap = {} # cons ID --> memberships - self.historichansard = {} # Historic Hansard commons membership ID -> MPs - self.pims = {} # Pims membership ID and date -> MPs - self.mnis = {} # Parliament Member Names ID to person - - self.parties = {} # party --> memberships - self.membertopersonmap = {} # member ID --> person ID - self.persontomembermap = {} # person ID --> memberships + self.members = {} # ID --> membership + self.persons = {} # ID --> person + self.fullnames = {} # "Firstname Lastname" --> memberships + self.lastnames = {} # Surname --> memberships + + self.constoidmap = {} # constituency name --> cons attributes (with date and ID) + self.considtonamemap = {} # cons ID --> name + self.considtomembermap = {} # cons ID --> memberships + self.historichansard = {} # Historic Hansard commons membership ID -> MPs + self.pims = {} # Pims membership ID and date -> MPs + self.mnis = {} # Parliament Member Names ID to person + + self.parties = {} # party --> memberships + self.membertopersonmap = {} # member ID --> person ID + self.persontomembermap = {} # person ID --> memberships def import_constituencies(self): - data = json.load(open(os.path.join(members_dir, 'people.json'))) - for con in data['posts']: - if con['organization_id'] != self.import_organization_id: + data = json.load(open(os.path.join(members_dir, "people.json"))) + for con in data["posts"]: + if con["organization_id"] != self.import_organization_id: continue attr = { - 'id': con['id'], - 'start_date': con.get('start_date', '0000-00-00'), - 'end_date': con.get('end_date', '9999-12-31'), + "id": con["id"], + "start_date": con.get("start_date", "0000-00-00"), + "end_date": con.get("end_date", "9999-12-31"), } - if len(attr['start_date']) == 4: - attr['start_date'] = '%s-01-01' % attr['start_date'] - if 
len(attr['end_date']) == 4: - attr['end_date'] = '%s-12-31' % attr['end_date'] + if len(attr["start_date"]) == 4: + attr["start_date"] = "%s-01-01" % attr["start_date"] + if len(attr["end_date"]) == 4: + attr["end_date"] = "%s-12-31" % attr["end_date"] - names = [con['area']['name']] + con['area'].get('other_names', []) + names = [con["area"]["name"]] + con["area"].get("other_names", []) for name in names: - if not con['id'] in self.considtonamemap: - self.considtonamemap[con['id']] = name + if con["id"] not in self.considtonamemap: + self.considtonamemap[con["id"]] = name self.constoidmap.setdefault(name, []).append(attr) nopunc = self.strip_punctuation(name) self.constoidmap.setdefault(nopunc, []).append(attr) def strip_punctuation(self, cons): - nopunc = cons.replace(',','').replace('-','').replace(' ','').lower().strip() + nopunc = cons.replace(",", "").replace("-", "").replace(" ", "").lower().strip() return nopunc def import_people_json(self): - data = json.load(open(os.path.join(members_dir, 'people.json'))) - posts = {post['id']: post for post in data['posts']} - orgs = {org['id']: org for org in data['organizations']} - for mship in data['memberships']: + data = json.load(open(os.path.join(members_dir, "people.json"))) + posts = {post["id"]: post for post in data["posts"]} + orgs = {org["id"]: org for org in data["organizations"]} + for mship in data["memberships"]: self.import_people_membership(mship, posts, orgs) - for person in data['persons']: + for person in data["persons"]: self.import_people_names(person) def import_people_membership(self, mship, posts, orgs): - if 'post_id' not in mship or posts[mship['post_id']]['organization_id'] != self.import_organization_id: + if ( + "post_id" not in mship + or posts[mship["post_id"]]["organization_id"] != self.import_organization_id + ): return if mship["id"] in self.membertopersonmap: raise Exception("Same member id %s appeared twice" % mship["id"]) - self.membertopersonmap[mship["id"]] = mship['person_id'] - self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"]) + self.membertopersonmap[mship["id"]] = mship["person_id"] + self.persontomembermap.setdefault(mship["person_id"], []).append(mship["id"]) if self.members.get(mship["id"]): raise Exception("Repeated identifier %s in members JSON file" % mship["id"]) self.members[mship["id"]] = mship - if 'end_date' not in mship: - mship['end_date'] = '9999-12-31' + if "end_date" not in mship: + mship["end_date"] = "9999-12-31" # index by constituency - mship['constituency'] = posts[mship['post_id']]['area']['name'] - consids = self.constoidmap[mship['constituency']] + mship["constituency"] = posts[mship["post_id"]]["area"]["name"] + consids = self.constoidmap[mship["constituency"]] consid = None # find the constituency id for this person - mship_start_date = len(mship['start_date'])==4 and ('%s-01-01' % mship['start_date']) or mship['start_date'] - mship_end_date = len(mship['end_date'])==4 and ('%s-12-31' % mship['end_date']) or mship['end_date'] + mship_start_date = ( + len(mship["start_date"]) == 4 + and ("%s-01-01" % mship["start_date"]) + or mship["start_date"] + ) + mship_end_date = ( + len(mship["end_date"]) == 4 + and ("%s-12-31" % mship["end_date"]) + or mship["end_date"] + ) for cons in consids: - if (cons['start_date'] <= mship_start_date and - mship_start_date <= mship_end_date and - mship_end_date <= cons['end_date']): - if consid and consid != cons['id']: - raise Exception("Two constituency ids %s %s overlap with MP %s" % (consid, cons['id'], 
mship['id'])) - consid = cons['id'] + if ( + cons["start_date"] <= mship_start_date + and mship_start_date <= mship_end_date + and mship_end_date <= cons["end_date"] + ): + if consid and consid != cons["id"]: + raise Exception( + "Two constituency ids %s %s overlap with MP %s" + % (consid, cons["id"], mship["id"]) + ) + consid = cons["id"] if not consid: raise Exception("Constituency '%s' not found" % mship["constituency"]) # check name in members file is same as default in cons file backformed_cons = self.considtonamemap[consid] if backformed_cons != mship["constituency"]: - raise Exception("Constituency '%s' in members file differs from first constituency '%s' listed in cons file" % (mship["constituency"], backformed_cons)) + raise Exception( + "Constituency '%s' in members file differs from first constituency '%s' listed in cons file" + % (mship["constituency"], backformed_cons) + ) # check first date ranges don't overlap, MPs only # Only check modern MPs as we might have overlapping data previously - if self.import_organization_id == 'house-of-commons': + if self.import_organization_id == "house-of-commons": for cons in self.considtomembermap.get(consid, []): - if cons['end_date'] < '1997-05-01': continue - if cons['start_date'] <= mship['start_date'] <= cons['end_date'] \ - or cons['start_date'] <= mship['end_date'] <= cons['end_date'] \ - or mship['start_date'] <= cons['start_date'] <= mship['end_date'] \ - or mship['start_date'] <= cons['end_date'] <= mship['end_date']: - raise Exception("%s %s Two MP entries for constituency %s with overlapping dates" % (mship, cons, consid)) + if cons["end_date"] < "1997-05-01": + continue + if ( + cons["start_date"] <= mship["start_date"] <= cons["end_date"] + or cons["start_date"] <= mship["end_date"] <= cons["end_date"] + or mship["start_date"] <= cons["start_date"] <= mship["end_date"] + or mship["start_date"] <= cons["end_date"] <= mship["end_date"] + ): + raise Exception( + "%s %s Two MP entries for constituency %s with overlapping dates" + % (mship, cons, consid) + ) # then add in self.considtomembermap.setdefault(consid, []).append(mship) # ... 
and by party - if 'on_behalf_of_id' in mship: - mship['party'] = orgs[mship['on_behalf_of_id']]['name'] - self.parties.setdefault(mship['party'], []).append(mship) + if "on_behalf_of_id" in mship: + mship["party"] = orgs[mship["on_behalf_of_id"]]["name"] + self.parties.setdefault(mship["party"], []).append(mship) - if 'hansard_id' in mship: - self.historichansard.setdefault(int(mship['hansard_id']), []).append(mship) + if "hansard_id" in mship: + self.historichansard.setdefault(int(mship["hansard_id"]), []).append(mship) def import_people_names(self, person): - if person['id'] not in self.persontomembermap: + if person["id"] not in self.persontomembermap: return - self.persons[person['id']] = person - memberships = [self.members[x] for x in self.persontomembermap[person['id']]] - for other_name in person.get('other_names', []): - if other_name.get('note') == 'Main': + self.persons[person["id"]] = person + memberships = [self.members[x] for x in self.persontomembermap[person["id"]]] + for other_name in person.get("other_names", []): + if other_name.get("note") == "Main": self.import_people_main_name(other_name, memberships) - elif other_name.get('note') == 'Alternate': + elif other_name.get("note") == "Alternate": self.import_people_alternate_name(person, other_name, memberships) - for identifier in person.get('identifiers', []): - if identifier.get('scheme') == 'pims_id': - id = identifier.get('identifier') + for identifier in person.get("identifiers", []): + if identifier.get("scheme") == "pims_id": + id = identifier.get("identifier") for m in memberships: p = person.copy() - p['start_date'] = m['start_date'] - p['end_date'] = m['end_date'] + p["start_date"] = m["start_date"] + p["end_date"] = m["end_date"] self.pims.setdefault(id, []).append(p) - elif identifier.get('scheme') == 'datadotparl_id': - id = identifier.get('identifier') + elif identifier.get("scheme") == "datadotparl_id": + id = identifier.get("identifier") for m in memberships: p = person.copy() - p['start_date'] = m['start_date'] - p['end_date'] = m['end_date'] + p["start_date"] = m["start_date"] + p["end_date"] = m["end_date"] self.mnis.setdefault(id, []).append(p) def import_people_main_name(self, name, memberships): - mships = [m for m in memberships if m['start_date'] <= name.get('end_date', '9999-12-31') and m['end_date'] >= name.get('start_date', '1000-01-01')] - if not mships: return + mships = [ + m + for m in memberships + if m["start_date"] <= name.get("end_date", "9999-12-31") + and m["end_date"] >= name.get("start_date", "1000-01-01") + ] + if not mships: + return try: family_name = name["family_name"] given_name = name["given_name"] except: - family_name = name['lordname'] - if name['lordofname']: - family_name += ' of ' + name['lordofname'] - given_name = name['honorific_prefix'] - compoundname = '%s %s' % (given_name, family_name) - no_initial = '' - fnnomidinitial = re.findall('^(\S*)\s\S$', given_name) + family_name = name["lordname"] + if name["lordofname"]: + family_name += " of " + name["lordofname"] + given_name = name["honorific_prefix"] + compoundname = "%s %s" % (given_name, family_name) + no_initial = "" + fnnomidinitial = re.findall("^(\S*)\s\S$", given_name) if fnnomidinitial: no_initial = fnnomidinitial[0] + " " + family_name - initial_name = '' - if self.import_organization_id != 'house-of-commons' and given_name: + initial_name = "" + if self.import_organization_id != "house-of-commons" and given_name: initial_name = given_name[0] + " " + family_name for m in mships: - newattr = {'id': m['id'], 
'person_id': m['person_id']} + newattr = {"id": m["id"], "person_id": m["person_id"]} # merge date ranges - take the smallest range covered by # the membership, and the alias's range (if it has one) - newattr['start_date'] = max(m['start_date'], name.get('start_date', '1000-01-01')) - newattr['end_date'] = min(m['end_date'], name.get('end_date', '9999-12-31')) + newattr["start_date"] = max( + m["start_date"], name.get("start_date", "1000-01-01") + ) + newattr["end_date"] = min(m["end_date"], name.get("end_date", "9999-12-31")) self.fullnames.setdefault(compoundname, []).append(newattr) if no_initial: self.fullnames.setdefault(no_initial, []).append(newattr) @@ -181,39 +215,54 @@ def import_people_main_name(self, name, memberships): self.lastnames.setdefault(family_name, []).append(newattr) def import_people_alternate_name(self, person, other_name, memberships): - if other_name.get('organization_id') not in (None, self.import_organization_id): return - mships = [m for m in memberships if m['start_date'] <= other_name.get('end_date', '9999-12-31') and m['end_date'] >= other_name.get('start_date', '1000-01-01')] + if other_name.get("organization_id") not in (None, self.import_organization_id): + return + mships = [ + m + for m in memberships + if m["start_date"] <= other_name.get("end_date", "9999-12-31") + and m["end_date"] >= other_name.get("start_date", "1000-01-01") + ] for m in mships: - newattr = {'id': m['id'], 'person_id': m['person_id']} + newattr = {"id": m["id"], "person_id": m["person_id"]} # merge date ranges - take the smallest range covered by # the membership, and the alias's range (if it has one) - newattr['start_date'] = max(m['start_date'], other_name.get('start_date', '1000-01-01')) - newattr['end_date'] = min(m['end_date'], other_name.get('end_date', '9999-12-31')) - if other_name.get('family_name'): - self.lastnames.setdefault(other_name['family_name'], []).append(newattr) + newattr["start_date"] = max( + m["start_date"], other_name.get("start_date", "1000-01-01") + ) + newattr["end_date"] = min( + m["end_date"], other_name.get("end_date", "9999-12-31") + ) + if other_name.get("family_name"): + self.lastnames.setdefault(other_name["family_name"], []).append(newattr) else: - self.fullnames.setdefault(other_name['name'], []).append(newattr) + self.fullnames.setdefault(other_name["name"], []).append(newattr) # Used by Commons and NI def name_on_date(self, person_id, date): person = self.persons[person_id] - for nm in person['other_names']: - if nm['note'] != 'Main': continue - if nm.get('start_date', '0000-00-00') <= date <= nm.get('end_date', '9999-12-31'): - if 'family_name' in nm: + for nm in person["other_names"]: + if nm["note"] != "Main": + continue + if ( + nm.get("start_date", "0000-00-00") + <= date + <= nm.get("end_date", "9999-12-31") + ): + if "family_name" in nm: name = nm["family_name"] - if nm.get('given_name'): + if nm.get("given_name"): name = nm["given_name"] + " " + name - if nm.get('honorific_prefix'): + if nm.get("honorific_prefix"): name = nm["honorific_prefix"] + " " + name - else: # Lord (e.g. Lord Morrow in NI) - name = nm['honorific_prefix'] - if nm['lordname']: - name += ' %s' % nm['lordname'] - if nm['lordofname']: - name += ' of %s' % nm['lordofname'] + else: # Lord (e.g. 
Lord Morrow in NI) + name = nm["honorific_prefix"] + if nm["lordname"]: + name += " %s" % nm["lordname"] + if nm["lordofname"]: + name += " of %s" % nm["lordofname"] return name - raise Exception('No found for %s on %s' % (person['id'], date)) + raise Exception("No found for %s on %s" % (person["id"], date)) def membertoperson(self, memberid): return self.membertopersonmap[memberid] @@ -221,12 +270,12 @@ def membertoperson(self, memberid): def _match_by_id(self, lookup, id, date): matches = getattr(self, lookup).get(id, []) for m in matches: - if m['start_date'] <= date <= m['end_date']: + if m["start_date"] <= date <= m["end_date"]: return m return None def match_by_mnis(self, mnis_id, date): - return self._match_by_id('mnis', mnis_id, date) + return self._match_by_id("mnis", mnis_id, date) def match_by_pims(self, pims_id, date): - return self._match_by_id('pims', pims_id, date) + return self._match_by_id("pims", pims_id, date) diff --git a/pyscraper/contextexception.py b/pyscraper/contextexception.py index 9d1b3b7d..5547caf6 100755 --- a/pyscraper/contextexception.py +++ b/pyscraper/contextexception.py @@ -1,9 +1,9 @@ #! $Id: contextexception.py,v 1.12 2004/12/23 12:27:09 goatchurch Exp $ # vim:sw=8:ts=8:et:nowrap -class ContextException(Exception): - def __init__(self, description, stamp = None, fragment = None): +class ContextException(Exception): + def __init__(self, description, stamp=None, fragment=None): self.description = description self.stamp = stamp self.fragment = fragment diff --git a/pyscraper/get_links_from_ep.py b/pyscraper/get_links_from_ep.py index 30db8627..d7e63cac 100755 --- a/pyscraper/get_links_from_ep.py +++ b/pyscraper/get_links_from_ep.py @@ -1,37 +1,35 @@ #!/usr/bin/env python3 import operator -from lxml import etree + from everypolitician import EveryPolitician +from lxml import etree def output_file(country, legislature, filename): data = EveryPolitician().country(country).legislature(legislature) output_filename = "../members/{0}.xml".format(filename) - root = etree.Element('publicwhip') + root = etree.Element("publicwhip") - sorted_people = sorted( - data.popolo().persons, - key=operator.attrgetter('name') - ) + sorted_people = sorted(data.popolo().persons, key=operator.attrgetter("name")) for person in sorted_people: - parlparse_id = person.identifier_value('parlparse') + parlparse_id = person.identifier_value("parlparse") if parlparse_id is not None: props = {} if person.twitter: - props['twitter_username'] = person.twitter + props["twitter_username"] = person.twitter if person.facebook: - props['facebook_page'] = person.facebook + props["facebook_page"] = person.facebook if props: - props['id'] = parlparse_id - info = etree.Element('personinfo', props) + props["id"] = parlparse_id + info = etree.Element("personinfo", props) root.append(info) et = etree.ElementTree(root) et.write(output_filename, pretty_print=True) -output_file('UK', 'Commons', 'social-media-commons') -output_file('Scotland', 'Parliament', 'social-media-sp') -output_file('Northern-Ireland', 'Assembly', 'social-media-ni') +output_file("UK", "Commons", "social-media-commons") +output_file("Scotland", "Parliament", "social-media-sp") +output_file("Northern-Ireland", "Assembly", "social-media-ni") diff --git a/pyscraper/gettwittermps.py b/pyscraper/gettwittermps.py index 3a10319d..4cecc71e 100755 --- a/pyscraper/gettwittermps.py +++ b/pyscraper/gettwittermps.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -import urllib.request import csv +import urllib.request import xml.sax uri = 
"http://spreadsheets.google.com/tq?tqx=out:csv&key=0AjWA_TWMI4t_dFI5MWRWZkRWbFJ6MVhHQzVmVndrZnc&hl=en_GB" @@ -9,34 +9,39 @@ f = urllib.request.urlopen(uri) csv_data = f.read() lines = csv_data.split("\n") -rows = csv.reader(lines.__iter__(), delimiter=',', quotechar='"') +rows = csv.reader(lines.__iter__(), delimiter=",", quotechar='"') + class PeopleParser(xml.sax.handler.ContentHandler): def __init__(self): self.parser = xml.sax.make_parser() self.parser.setContentHandler(self) - def parse(self,filename): + + def parse(self, filename): self.office_id_to_person_id = {} self.parser.parse(filename) - def startElement(self,name,attrs): - if name == 'person': - self.current_person_id = attrs['id'] - elif name == 'office': - self.office_id_to_person_id[attrs['id']] = self.current_person_id - def endElement(self,name): - if name == 'person': + + def startElement(self, name, attrs): + if name == "person": + self.current_person_id = attrs["id"] + elif name == "office": + self.office_id_to_person_id[attrs["id"]] = self.current_person_id + + def endElement(self, name): + if name == "person": self.current_person_id = None + people_parser = PeopleParser() people_parser.parse("../members/people.xml") person_id_to_twitter_username = {} output_filename = "../members/twitter-commons.xml" -fp = open(output_filename,"w") -fp.write(''' +fp = open(output_filename, "w") +fp.write(""" -''') +""") for r in rows: if len(r) < 5: @@ -49,8 +54,10 @@ def endElement(self,name): if len(twitter_username) == 0: continue if member_id not in people_parser.office_id_to_person_id: - raise "No person ID found for %s in line %s" % (member_id,"#".join(r)) + raise "No person ID found for %s in line %s" % (member_id, "#".join(r)) person_id = people_parser.office_id_to_person_id[member_id] - fp.write("\n"%(person_id,twitter_username)) + fp.write( + '\n' % (person_id, twitter_username) + ) fp.write("") diff --git a/pyscraper/gidmatching.py b/pyscraper/gidmatching.py index c11ad2a5..84287746 100644 --- a/pyscraper/gidmatching.py +++ b/pyscraper/gidmatching.py @@ -1,344 +1,406 @@ +import difflib import re + import miscfuncs -import difflib -#from xmlfilewrite import PrevParsedFile + +# from xmlfilewrite import PrevParsedFile class PrevParsedFile: - pass + pass + toppath = miscfuncs.toppath pwxmldirs = miscfuncs.pwxmldirs tempfilename = miscfuncs.tempfilename -from miscfuncs import NextAlphaString, AlphaStringToOrder - - # get the min index that matches this def GetMinIndex(indx, a): - assert indx[0] == 0 and a < indx[-1] - i0, i1 = 0, len(indx) - 1 - while i0 + 1 < i1: - im = (i0 + i1) // 2 - assert i0 != im and i1 != im - if indx[im] <= a: - i0 = im - else: - i1 = im - assert indx[i0] <= a < indx[i1] - return i0 + assert indx[0] == 0 and a < indx[-1] + i0, i1 = 0, len(indx) - 1 + while i0 + 1 < i1: + im = (i0 + i1) // 2 + assert i0 != im and i1 != im + if indx[im] <= a: + i0 = im + else: + i1 = im + assert indx[i0] <= a < indx[i1] + return i0 def PrepareXMLForDiff(scrapeversion): - chks = re.findall("<(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)\s(.*?)>\n?([\s\S]*?)\n?\s*", scrapeversion) - - # make identically structured huge string over the previous xml file with heading stuff stripped out - essxlist = [ ] - essxindx = [ ] - for chk in chks: - # print chk - assert chk[0] == chk[3] # chunk type (this can fail if due to the lack of two \n's between the two labels, and thus detects an empty speech, which should not be there. 
- # new_chk = chk[2] - new_chk = re.sub( - r'(?s)(<(p|tr)\s[^>]*>)(.*?)(<\/\2>)', - lambda m: (''.join((m.group(1), re.sub('\n', ' ', m.group(3)), m.group(4)))), - chk[2] - ) - essxindx.append(len(essxlist)) - essxlist.append("HEADING-" + chk[0]) - speaker = re.search('nospeaker="true"|divnumber|(?:speakerid|person_id)="[^"]*"', chk[1]).group(0) - essxlist.append(speaker) - - if re.match("oral-heading|major-heading|minor-heading", chk[0]): - #assert not re.search("[<>]", chk[2]) - heading = new_chk.strip() - essxlist.extend(heading.split()) - else: - for ps in new_chk.split('\n'): - m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*\s*$", ps) - if m: - para = m.group(1) - else: - assert re.match("\s*]*>|&\w+;|[^<>\s]+", para)) - - essxindx.append(len(essxlist)) - assert len(chks) + 1 == len(essxindx) - return essxindx, essxlist, chks + chks = re.findall( + "<(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)\s(.*?)>\n?([\s\S]*?)\n?\s*", + scrapeversion, + ) + + # make identically structured huge string over the previous xml file with heading stuff stripped out + essxlist = [] + essxindx = [] + for chk in chks: + # print chk + assert ( + chk[0] == chk[3] + ) # chunk type (this can fail if due to the lack of two \n's between the two labels, and thus detects an empty speech, which should not be there. + # new_chk = chk[2] + new_chk = re.sub( + r"(?s)(<(p|tr)\s[^>]*>)(.*?)(<\/\2>)", + lambda m: ( + "".join((m.group(1), re.sub("\n", " ", m.group(3)), m.group(4))) + ), + chk[2], + ) + essxindx.append(len(essxlist)) + essxlist.append("HEADING-" + chk[0]) + speaker = re.search( + 'nospeaker="true"|divnumber|(?:speakerid|person_id)="[^"]*"', chk[1] + ).group(0) + essxlist.append(speaker) + + if re.match("oral-heading|major-heading|minor-heading", chk[0]): + # assert not re.search("[<>]", chk[2]) + heading = new_chk.strip() + essxlist.extend(heading.split()) + else: + for ps in new_chk.split("\n"): + m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*\s*$", ps) + if m: + para = m.group(1) + else: + assert re.match( + "\s*]*>|&\w+;|[^<>\s]+", para)) + + essxindx.append(len(essxlist)) + assert len(chks) + 1 == len(essxindx) + return essxindx, essxlist, chks + # the difficult function that finds matches in the gids # we don't use an xml parsing feature because it transforms the text # Very hard use of difflib going on here too # We make great use of the indices of the different lists def FactorChanges(flatb, scrapeversion): - essxindx, essxlist, chks = PrepareXMLForDiff(scrapeversion) - - # now make a huge string over the flatb with heading stuff stripped out - essflatblist = [ ] - essflatbindx = [ ] - for qb in flatb: - essflatbindx.append(len(essflatblist)) - essflatblist.append("HEADING-" + qb.typ) - essflatblist.append(re.search('nospeaker="true"|(?:speakerid|person_id)="[^"]*"', qb.speaker).group(0)) - - if re.match("oral-heading|major-heading|minor-heading", qb.typ): - heading = ("".join(qb.stext)).strip() - essflatblist.extend(heading.split()) - - # strip format labels out of paragraphs - else: - for ps in qb.stext: - m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*\s*$", ps) - if m: - para = m.group(1) - else: - assert re.match("\s*]*>|&\w+;|[^<>\s]+", para)) - - essflatbindx.append(len(essflatblist)) - assert len(essflatbindx) == len(flatb) + 1 - - - # make parallel sequences to the flatb and to this which are stripped down to their essence - # so that the difflib can work on them - return DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb) + essxindx, 
essxlist, chks = PrepareXMLForDiff(scrapeversion) + + # now make a huge string over the flatb with heading stuff stripped out + essflatblist = [] + essflatbindx = [] + for qb in flatb: + essflatbindx.append(len(essflatblist)) + essflatblist.append("HEADING-" + qb.typ) + essflatblist.append( + re.search( + 'nospeaker="true"|(?:speakerid|person_id)="[^"]*"', qb.speaker + ).group(0) + ) + + if re.match("oral-heading|major-heading|minor-heading", qb.typ): + heading = ("".join(qb.stext)).strip() + essflatblist.extend(heading.split()) + + # strip format labels out of paragraphs + else: + for ps in qb.stext: + m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*\s*$", ps) + if m: + para = m.group(1) + else: + assert re.match( + "\s*]*>|&\w+;|[^<>\s]+", para)) + + essflatbindx.append(len(essflatblist)) + assert len(essflatbindx) == len(flatb) + 1 + + # make parallel sequences to the flatb and to this which are stripped down to their essence + # so that the difflib can work on them + return DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb) def DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb): - # now apply the diffing function on this - sm = difflib.SequenceMatcher(None, essxlist, essflatblist) - smblocks = [ ((smb[0], smb[0] + smb[2]), (smb[1], smb[1] + smb[2])) for smb in sm.get_matching_blocks()[:-1] ] - - # we collect the range for the previous speeches and map it to a set of ranges - # in the next speeches - - # case of missing entries map to the last speech matched to. - lastmatchg = None - - res = [ ] - for ix in range(len(chks)): - ixr = (essxindx[ix], essxindx[ix + 1]) - nixrl = [ ] - nixrlsz = 0 - - # intersect the set of ranges against the contiguous blocks and match forwards - for lsmb in smblocks: - if ixr[1] > lsmb[0][0] and ixr[0] < lsmb[0][1]: - ixi = (max(ixr[0], lsmb[0][0]), min(ixr[1], lsmb[0][1])) - assert ixi[0] < ixi[1] - offs = lsmb[1][0] - lsmb[0][0] - ixit = (ixi[0] + offs, ixi[1] + offs) - assert not nixrl or (nixrl[-1][1] <= ixit[0]) - nixrl.append(ixit) - nixrlsz += ixit[1] - ixit[0] - - # at least one word is overlapping - if nixrl: - # go through the matchint cases - matchlist = [ GetMinIndex(essflatbindx, nixrl[0][0]) ] - if nixrlsz != ixr[1] - ixr[0] or len(nixrl) > 1: - matchtype = "changes" - for ixit in nixrl: - ml = GetMinIndex(essflatbindx, ixit[0]) - if matchlist[-1] != ml: - matchlist.append(ml) - ml = GetMinIndex(essflatbindx, ixit[1] - 1) - if matchlist[-1] != ml: - matchlist.append(ml) - if len(matchlist) != 1: - matchtype = "multiplecover" - else: - assert len(nixrl) == 1 - matchtype = "perfectmatch" - - # missing speech - else: - print(chks[ix]) - if lastmatchg: - print("Missing speech matched to last matched speech") - matchlist = [ lastmatchg ] - else: - print("No match on first speech problem.") - matchlist = [] - matchtype = "missing" - - # output the (sometimes more than) one redirect of the right redirect type - chk = chks[ix] - oldgid = re.search('id="([\w\d\-\.,/]*)"', chk[1]).group(1) - for matchg in matchlist: - res.append('\n' % (oldgid, flatb[matchg].GID, matchtype)) - lastmatchg = matchg - - # output old version as well, if it's different - if matchtype != "perfectmatch": - res.append("<%s %s>\n" % (chk[0], chk[1])) - res.append(chk[2]) - res.append("\n") - res.append("\n" % chk[0]) - - return res + # now apply the diffing function on this + sm = difflib.SequenceMatcher(None, essxlist, essflatblist) + smblocks = [ + ((smb[0], smb[0] + smb[2]), (smb[1], smb[1] + smb[2])) + for smb in 
sm.get_matching_blocks()[:-1] + ] + + # we collect the range for the previous speeches and map it to a set of ranges + # in the next speeches + + # case of missing entries map to the last speech matched to. + lastmatchg = None + + res = [] + for ix in range(len(chks)): + ixr = (essxindx[ix], essxindx[ix + 1]) + nixrl = [] + nixrlsz = 0 + + # intersect the set of ranges against the contiguous blocks and match forwards + for lsmb in smblocks: + if ixr[1] > lsmb[0][0] and ixr[0] < lsmb[0][1]: + ixi = (max(ixr[0], lsmb[0][0]), min(ixr[1], lsmb[0][1])) + assert ixi[0] < ixi[1] + offs = lsmb[1][0] - lsmb[0][0] + ixit = (ixi[0] + offs, ixi[1] + offs) + assert not nixrl or (nixrl[-1][1] <= ixit[0]) + nixrl.append(ixit) + nixrlsz += ixit[1] - ixit[0] + + # at least one word is overlapping + if nixrl: + # go through the matchint cases + matchlist = [GetMinIndex(essflatbindx, nixrl[0][0])] + if nixrlsz != ixr[1] - ixr[0] or len(nixrl) > 1: + matchtype = "changes" + for ixit in nixrl: + ml = GetMinIndex(essflatbindx, ixit[0]) + if matchlist[-1] != ml: + matchlist.append(ml) + ml = GetMinIndex(essflatbindx, ixit[1] - 1) + if matchlist[-1] != ml: + matchlist.append(ml) + if len(matchlist) != 1: + matchtype = "multiplecover" + else: + assert len(nixrl) == 1 + matchtype = "perfectmatch" + + # missing speech + else: + print(chks[ix]) + if lastmatchg: + print("Missing speech matched to last matched speech") + matchlist = [lastmatchg] + else: + print("No match on first speech problem.") + matchlist = [] + matchtype = "missing" + + # output the (sometimes more than) one redirect of the right redirect type + chk = chks[ix] + oldgid = re.search('id="([\w\d\-\.,/]*)"', chk[1]).group(1) + for matchg in matchlist: + res.append( + '\n' + % (oldgid, flatb[matchg].GID, matchtype) + ) + lastmatchg = matchg + + # output old version as well, if it's different + if matchtype != "perfectmatch": + res.append("<%s %s>\n" % (chk[0], chk[1])) + res.append(chk[2]) + res.append("\n") + res.append("\n" % chk[0]) + + return res def MeasureBlockSimilarity(oldtext, qblock): - flattenoldtext = re.split("<[^>]*>|\s+", oldtext) - flattennewtext = qblock.FlattenTextWords() + flattenoldtext = re.split("<[^>]*>|\s+", oldtext) + flattennewtext = qblock.FlattenTextWords() - sm = difflib.SequenceMatcher(lambda x: x == "", flattenoldtext, flattennewtext) - return sm.ratio() + sm = difflib.SequenceMatcher(lambda x: x == "", flattenoldtext, flattennewtext) + return sm.ratio() # special case because the questions can be re-ordered def FactorChangesWrans(majblocks, scrapeversion): - - # we need to break the scrape version - # we separate out and match the major headings separately - # (anyway, these aren't really used) - - # and then match the questions - - # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases - res = re.findall('\n', scrapeversion) - - # extract major headings and match injectively exactly (till we find a failed example). 
- mhchks = re.findall(']*>\n\s*([\s\S]*?)\s*?\n', scrapeversion) - - majblocknames = [ "".join(majblock[0].stext).strip() for majblock in majblocks ] - for mhchk in mhchks: - if mhchk[1] in majblocknames: - i = majblocknames.index(mhchk[1]) - res.append('\n' % (mhchk[0], majblocks[i][0].qGID)) - majblocknames[i] = None # take it out of circulation - else: - res.append('\n' % (mhchk[0], majblocks[0][0].qGID)) - - # break into question blocks - # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text - # the " tags have been removed, so split to end of document - qebchks = re.findall(']*)>\n([\s\S]*?)\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)', - scrapeversion) - - # make the map from qnums to blocks - qnummissings = [ ] - qnummapq = { } - for majblock in majblocks: - for qblock in majblock[1]: - for qnum in qblock.qnums: - assert qnum not in qnummapq # failure means this qnum is found twice in the newly parsed file. - qnummapq[qnum] = qblock - if re.match("ZZZZerror", qnum): - qnummissings.append(qnum) - - - # for each block, find the map forward and check if we want to reprint it in full. - for qebchk in qebchks: - qqnums = re.findall('
]*?qnum="([\d\w]+)">', qebchk[3]) - assert qqnums - - # make sure that they all link to the same qnum in the new one - qblock = None - for qqnum in qqnums: - if qblock: - if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID: - print(qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID) - assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID - elif qqnum != '0' and qqnum in qnummapq: # 0 is when there is a missing qnum - qblock = qnummapq[qqnum] - - # in this case the qnums are fail for finding the match, so we either drop it, or find - # the match by closest in text. Prefer to match blocks to - if not qblock: - # find the closest match for this block out of this missing qnum blocks on the new page - # (this will need to account for all blocks if in future the correction is to add in the qnum) - if qnummissings: - qmissblocksscore = [ ] - for qqnum in qnummissings: - similarity = MeasureBlockSimilarity(qebchk[3], qnummapq[qqnum]) - qmissblocksscore.append((similarity, qqnum)) - qmissblockscorebest = max(qmissblocksscore) - qblock = qnummapq[qmissblockscorebest[1]] - if miscfuncs.IsNotQuiet(): - print("Missing qnum; mapping %s to %s with score %f" % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0])) - assert qmissblockscorebest[0] > 0.8 # otherwise it's not really a match and we need to look harder. - # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them. - - # now have to check matching. - # convert both to strings and compare. - essxfq = [ ] # this forms the string which we will be comparing against. - qebchkquesids = [ ] # expect only one of each - qebchkreplids = [ ] - for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]): - mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>', wd) - if mwd: - essxfq.append("<%s>" % mwd.group(1)) - assert mwd.group(1) not in ("reply", "ques") or mwd.group(2) - if mwd.group(1) == "ques": - qebchkquesids.append(mwd.group(2)) - elif mwd.group(1) == "reply": - qebchkreplids.append(mwd.group(2)) - - elif not re.match("\n' % (qebchk[0], majblocks[0][0].qGID)) - for qebq in qebchkquesids: - res.append('\n' % (qebq, majblocks[0][0].qGID)) - for qebqr in qebchkreplids: - res.append('\n' % (qebqr, majblocks[0][0].qGID)) - # Is the lred current-gidredirects bit needed here too? Don't think so, but not sure - continue - - # build up the same summary from the question block - essbkfq = [ ] - for qblockqr in (qblock.queses, qblock.replies): - for qb in qblockqr: - essbkfq.append("<%s>" % qb.typ) - for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", "\n".join(qb.stext)): - mwd = re.match("<(p|tr)[^>]*>", wd) - if mwd: - essbkfq.append("<%s>" % mwd.group(1)) - elif not re.match("" % qb.typ) - - # print the link forwards - bchanges = (essxfq != essbkfq) - matchtype = bchanges and "changes" or "perfectmatch" - if bchanges: - res.append("\n") - res.append('\n' % (qebchk[0], qblock.headingqb.qGID, matchtype)) - - # write the parallel redirects for the question and reply (both mapping to same parts of each) - # this may be more sophisticated once we see an example of failure - # ultimately this is a job for paragraph matching - - # sometimes we get more than one question. - # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother. 
- if len(qebchkquesids) != len(qblock.queses): - print(len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID) - assert len(qebchkquesids) == len(qblock.queses) - for i in range(len(qebchkquesids)): - res.append('\n' % (qebchkquesids[i], qblock.queses[i].qGID, matchtype)) - - assert len(qebchkreplids) == len(qblock.replies) == 1 - for qebqr in qebchkreplids: - res.append('\n' % (qebqr, qblock.replies[0].qGID, matchtype)) - - - # if changes write out the original, else just the gidmaps - if bchanges: - res.append('\n' % qebchk[0:2]) - res.append(qebchk[2]) - res.append('\n') - res.append(qebchk[3]) - res.append("\n\n") - else: - for lred in re.findall("]*>\n", qebchk[3]): - res.append("\t") - res.append(lred) - - return res - + # we need to break the scrape version + # we separate out and match the major headings separately + # (anyway, these aren't really used) + + # and then match the questions + + # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases + res = re.findall( + '\n', + scrapeversion, + ) + + # extract major headings and match injectively exactly (till we find a failed example). + mhchks = re.findall( + ']*>\n\s*([\s\S]*?)\s*?\n', + scrapeversion, + ) + + majblocknames = ["".join(majblock[0].stext).strip() for majblock in majblocks] + for mhchk in mhchks: + if mhchk[1] in majblocknames: + i = majblocknames.index(mhchk[1]) + res.append( + '\n' + % (mhchk[0], majblocks[i][0].qGID) + ) + majblocknames[i] = None # take it out of circulation + else: + res.append( + '\n' + % (mhchk[0], majblocks[0][0].qGID) + ) + + # break into question blocks + # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text + # the " tags have been removed, so split to end of document + qebchks = re.findall( + ']*)>\n([\s\S]*?)\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)', + scrapeversion, + ) + + # make the map from qnums to blocks + qnummissings = [] + qnummapq = {} + for majblock in majblocks: + for qblock in majblock[1]: + for qnum in qblock.qnums: + assert ( + qnum not in qnummapq + ) # failure means this qnum is found twice in the newly parsed file. + qnummapq[qnum] = qblock + if re.match("ZZZZerror", qnum): + qnummissings.append(qnum) + + # for each block, find the map forward and check if we want to reprint it in full. + for qebchk in qebchks: + qqnums = re.findall('
]*?qnum="([\d\w]+)">', qebchk[3]) + assert qqnums + + # make sure that they all link to the same qnum in the new one + qblock = None + for qqnum in qqnums: + if qblock: + if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID: + print(qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID) + assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID + elif ( + qqnum != "0" and qqnum in qnummapq + ): # 0 is when there is a missing qnum + qblock = qnummapq[qqnum] + + # in this case the qnums are fail for finding the match, so we either drop it, or find + # the match by closest in text. Prefer to match blocks to + if not qblock: + # find the closest match for this block out of this missing qnum blocks on the new page + # (this will need to account for all blocks if in future the correction is to add in the qnum) + if qnummissings: + qmissblocksscore = [] + for qqnum in qnummissings: + similarity = MeasureBlockSimilarity(qebchk[3], qnummapq[qqnum]) + qmissblocksscore.append((similarity, qqnum)) + qmissblockscorebest = max(qmissblocksscore) + qblock = qnummapq[qmissblockscorebest[1]] + if miscfuncs.IsNotQuiet(): + print( + "Missing qnum; mapping %s to %s with score %f" + % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0]) + ) + assert ( + qmissblockscorebest[0] > 0.8 + ) # otherwise it's not really a match and we need to look harder. + # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them. + + # now have to check matching. + # convert both to strings and compare. + essxfq = [] # this forms the string which we will be comparing against. + qebchkquesids = [] # expect only one of each + qebchkreplids = [] + for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]): + mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>', wd) + if mwd: + essxfq.append("<%s>" % mwd.group(1)) + assert mwd.group(1) not in ("reply", "ques") or mwd.group(2) + if mwd.group(1) == "ques": + qebchkquesids.append(mwd.group(2)) + elif mwd.group(1) == "reply": + qebchkreplids.append(mwd.group(2)) + + elif not re.match("\n' + % (qebchk[0], majblocks[0][0].qGID) + ) + for qebq in qebchkquesids: + res.append( + '\n' + % (qebq, majblocks[0][0].qGID) + ) + for qebqr in qebchkreplids: + res.append( + '\n' + % (qebqr, majblocks[0][0].qGID) + ) + # Is the lred current-gidredirects bit needed here too? Don't think so, but not sure + continue + + # build up the same summary from the question block + essbkfq = [] + for qblockqr in (qblock.queses, qblock.replies): + for qb in qblockqr: + essbkfq.append("<%s>" % qb.typ) + for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", "\n".join(qb.stext)): + mwd = re.match("<(p|tr)[^>]*>", wd) + if mwd: + essbkfq.append("<%s>" % mwd.group(1)) + elif not re.match("" % qb.typ) + + # print the link forwards + bchanges = essxfq != essbkfq + matchtype = bchanges and "changes" or "perfectmatch" + if bchanges: + res.append("\n") + res.append( + '\n' + % (qebchk[0], qblock.headingqb.qGID, matchtype) + ) + + # write the parallel redirects for the question and reply (both mapping to same parts of each) + # this may be more sophisticated once we see an example of failure + # ultimately this is a job for paragraph matching + + # sometimes we get more than one question. + # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother. 
+ if len(qebchkquesids) != len(qblock.queses): + print(len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID) + assert len(qebchkquesids) == len(qblock.queses) + for i in range(len(qebchkquesids)): + res.append( + '\n' + % (qebchkquesids[i], qblock.queses[i].qGID, matchtype) + ) + + assert len(qebchkreplids) == len(qblock.replies) == 1 + for qebqr in qebchkreplids: + res.append( + '\n' + % (qebqr, qblock.replies[0].qGID, matchtype) + ) + + # if changes write out the original, else just the gidmaps + if bchanges: + res.append('\n' % qebchk[0:2]) + res.append(qebchk[2]) + res.append("\n") + res.append(qebchk[3]) + res.append("\n\n") + else: + for lred in re.findall("]*>\n", qebchk[3]): + res.append("\t") + res.append(lred) + + return res diff --git a/pyscraper/lazyrunall.py b/pyscraper/lazyrunall.py index 1da9d401..fc3eeda5 100755 --- a/pyscraper/lazyrunall.py +++ b/pyscraper/lazyrunall.py @@ -3,19 +3,19 @@ # Run the script with --help to see command line options -import sys import os +import sys # change current directory to pyscraper folder script is in -os.chdir(os.path.dirname(sys.argv[0]) or '.') +os.chdir(os.path.dirname(sys.argv[0]) or ".") from optparse import OptionParser -from runfilters import RunFiltersDir, RunNIFilters -from regmem.filter import RunRegmemFilters -import ni.scrape -from regmem.pullgluepages import RegmemPullGluePages +import ni.scrape from miscfuncs import SetQuiet +from regmem.filter import RunRegmemFilters +from regmem.pullgluepages import RegmemPullGluePages +from runfilters import RunFiltersDir, RunNIFilters # Parse the command line parameters @@ -42,33 +42,64 @@ # See what options there are -parser.add_option("--force-parse", - action="store_true", dest="forceparse", default=False, - help="forces reprocessing of debates by first deleting output files") -parser.add_option("--force-scrape", - action="store_true", dest="forcescrape", default=False, - help="forces redownloading of HTML first deleting output files") - -parser.add_option("--from", dest="datefrom", metavar="date", default="1000-01-01", - help="date to process back to, default is start of time") -parser.add_option("--to", dest="dateto", metavar="date", default="9999-12-31", - help="date to process up to, default is present day") -parser.add_option("--date", dest="date", metavar="date", default=None, - help="date to process (overrides --from and --to)") - -parser.add_option("--patchtool", - action="store_true", dest="patchtool", default=None, - help="launch ./patchtool to fix errors in source HTML") -parser.add_option("--quietc", - action="store_true", dest="quietc", default=None, - help="low volume error messages; continue processing further files") +parser.add_option( + "--force-parse", + action="store_true", + dest="forceparse", + default=False, + help="forces reprocessing of debates by first deleting output files", +) +parser.add_option( + "--force-scrape", + action="store_true", + dest="forcescrape", + default=False, + help="forces redownloading of HTML first deleting output files", +) + +parser.add_option( + "--from", + dest="datefrom", + metavar="date", + default="1000-01-01", + help="date to process back to, default is start of time", +) +parser.add_option( + "--to", + dest="dateto", + metavar="date", + default="9999-12-31", + help="date to process up to, default is present day", +) +parser.add_option( + "--date", + dest="date", + metavar="date", + default=None, + help="date to process (overrides --from and --to)", +) + +parser.add_option( + "--patchtool", + action="store_true", + 
dest="patchtool", + default=None, + help="launch ./patchtool to fix errors in source HTML", +) +parser.add_option( + "--quietc", + action="store_true", + dest="quietc", + default=None, + help="low volume error messages; continue processing further files", +) (options, args) = parser.parse_args() -if (options.date): - options.datefrom = options.date - options.dateto = options.date +if options.date: + options.datefrom = options.date + options.dateto = options.date if options.quietc: - SetQuiet() + SetQuiet() # See what commands there are @@ -78,33 +109,33 @@ options.regmem = False options.ni = False for arg in args: - if arg == "scrape": - options.scrape = True - elif arg == "parse": - options.parse = True - elif arg == "regmem": - options.regmem = True - options.remote = True - elif arg == "regmem-local": - options.regmem = True - options.remote = False - elif arg == "ni": - options.ni = True - else: - print("error: no such option %s" % arg, file=sys.stderr) - parser.print_help() - sys.exit(1) -if len(args) == 0: + if arg == "scrape": + options.scrape = True + elif arg == "parse": + options.parse = True + elif arg == "regmem": + options.regmem = True + options.remote = True + elif arg == "regmem-local": + options.regmem = True + options.remote = False + elif arg == "ni": + options.ni = True + else: + print("error: no such option %s" % arg, file=sys.stderr) parser.print_help() sys.exit(1) +if len(args) == 0: + parser.print_help() + sys.exit(1) if not options.scrape and not options.parse: - print("error: choose what to do; scrape, parse, or both", file=sys.stderr) - parser.print_help() - sys.exit(1) + print("error: choose what to do; scrape, parse, or both", file=sys.stderr) + parser.print_help() + sys.exit(1) if not options.regmem and not options.ni: - print("error: choose what work on; regmem, several of them", file=sys.stderr) - parser.print_help() - sys.exit(1) + print("error: choose what work on; regmem, several of them", file=sys.stderr) + parser.print_help() + sys.exit(1) # Download/generate the new data @@ -116,7 +147,7 @@ # Parse it into XML if options.parse: - if options.ni: - RunFiltersDir(RunNIFilters, 'ni', options, options.forceparse) - if options.regmem: - RunFiltersDir(RunRegmemFilters, 'regmem', options, options.forceparse) + if options.ni: + RunFiltersDir(RunNIFilters, "ni", options, options.forceparse) + if options.regmem: + RunFiltersDir(RunRegmemFilters, "regmem", options, options.forceparse) diff --git a/pyscraper/lords/resolvenames.py b/pyscraper/lords/resolvenames.py index 2589de6d..8c7d68f9 100644 --- a/pyscraper/lords/resolvenames.py +++ b/pyscraper/lords/resolvenames.py @@ -1,62 +1,85 @@ -import json -import os.path import re -from contextexception import ContextException from base_resolver import ResolverBase +from contextexception import ContextException -titleconv = { 'L.':'Lord', - 'B.':'Baroness', - 'Abp.':'Archbishop', - 'Bp.':'Bishop', - 'V.':'Viscount', - 'E.':'Earl', - 'D.':'Duke', - 'M.':'Marquess', - 'C.':'Countess', - 'Ly.':'Lady', - } +titleconv = { + "L.": "Lord", + "B.": "Baroness", + "Abp.": "Archbishop", + "Bp.": "Bishop", + "V.": "Viscount", + "E.": "Earl", + "D.": "Duke", + "M.": "Marquess", + "C.": "Countess", + "Ly.": "Lady", +} # more tedious stuff to do: "earl of" and "sitting as" cases -hontitles = [ 'Lord ?Bishop', 'Bishop', 'Marquess', 'Lord', 'Baroness', 'Viscount', 'Earl', 'Countess', - 'Lord Archbishop', 'Archbishop', 'Duke', 'Lady' ] -hontitleso = '|'.join(hontitles) - -honcompl = re.compile('(?:(%s)|(%s) \s*(.*?))(?:\s+of\s+(.*))?$' % 
(hontitleso, hontitleso)) +hontitles = [ + "Lord ?Bishop", + "Bishop", + "Marquess", + "Lord", + "Baroness", + "Viscount", + "Earl", + "Countess", + "Lord Archbishop", + "Archbishop", + "Duke", + "Lady", +] +hontitleso = "|".join(hontitles) + +honcompl = re.compile( + "(?:(%s)|(%s) \s*(.*?))(?:\s+of\s+(.*))?$" % (hontitleso, hontitleso) +) + +rehonorifics = re.compile("(?: [CKO]BE| DL| TD| QC| KCMG| KCB)+$") -rehonorifics = re.compile('(?: [CKO]BE| DL| TD| QC| KCMG| KCB)+$') class LordsList(ResolverBase): - import_organization_id = 'house-of-lords' + import_organization_id = "house-of-lords" def reloadJSON(self): super(LordsList, self).reloadJSON() - self.lordnames={} # "lordnames" --> lords - self.aliases={} # Corrections to full names + self.lordnames = {} # "lordnames" --> lords + self.aliases = {} # Corrections to full names self.import_people_json() def import_people_membership(self, mship, posts, orgs): - if 'organization_id' not in mship or mship['organization_id'] != self.import_organization_id: + if ( + "organization_id" not in mship + or mship["organization_id"] != self.import_organization_id + ): return if mship["id"] in self.membertopersonmap: raise Exception("Same member id %s appeared twice" % mship["id"]) - self.membertopersonmap[mship["id"]] = mship['person_id'] - self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"]) + self.membertopersonmap[mship["id"]] = mship["person_id"] + self.persontomembermap.setdefault(mship["person_id"], []).append(mship["id"]) if self.members.get(mship["id"]): raise Exception("Repeated identifier %s in members JSON file" % mship["id"]) self.members[mship["id"]] = mship - if 'end_date' not in mship: - mship['end_date'] = '9999-12-31' + if "end_date" not in mship: + mship["end_date"] = "9999-12-31" def import_people_main_name(self, name, memberships): - mships = [m for m in memberships if m['start_date'] <= name.get('end_date', '9999-12-31') and m['end_date'] >= name.get('start_date', '1000-01-01')] - if not mships: return + mships = [ + m + for m in memberships + if m["start_date"] <= name.get("end_date", "9999-12-31") + and m["end_date"] >= name.get("start_date", "1000-01-01") + ] + if not mships: + return lname = name["lordname"] or name["lordofname"] lname = re.sub("\.", "", lname) assert lname @@ -67,17 +90,22 @@ def import_people_main_name(self, name, memberships): } for m in mships: newattr = attr.copy() - newattr['start_date'] = max(m['start_date'], name.get('start_date', '1000-01-01')) - newattr['end_date'] = min(m['end_date'], name.get('end_date', '9999-12-31')) - newattr['id'] = m["id"] + newattr["start_date"] = max( + m["start_date"], name.get("start_date", "1000-01-01") + ) + newattr["end_date"] = min(m["end_date"], name.get("end_date", "9999-12-31")) + newattr["id"] = m["id"] self.lordnames.setdefault(lname, []).append(newattr) def import_people_alternate_name(self, person, other_name, memberships): - if 'name' not in other_name: return # Only full names in Lords aliases - self.aliases[other_name['name']] = person['id'] + if "name" not in other_name: + return # Only full names in Lords aliases + self.aliases[other_name["name"]] = person["id"] # main matching function - def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bDivision): + def GetLordID( + self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bDivision + ): if ltitle == "Lord Bishop": ltitle = "Bishop" if ltitle == "Lord Archbishop": @@ -85,15 +113,15 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, 
stampurl, sdate, bD llordofname = llordofname.replace(".", "") llordname = llordname.replace(".", "") - llordname = re.sub('&#(039|146|8217);', "'", llordname) + llordname = re.sub("&#(039|146|8217);", "'", llordname) llordofname = llordofname.strip() llordname = llordname.strip() # TODO: Need a Lords version of member-aliases.xml I guess - if ltitle == "Bishop" and llordofname == "Southwell" and sdate>='2005-07-01': + if ltitle == "Bishop" and llordofname == "Southwell" and sdate >= "2005-07-01": llordofname = "Southwell and Nottingham" - if ltitle == "Bishop" and llordname == "Southwell" and sdate>='2005-07-01': + if ltitle == "Bishop" and llordname == "Southwell" and sdate >= "2005-07-01": llordname = "Southwell and Nottingham" lname = llordname or llordofname @@ -101,11 +129,11 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD lmatches = self.lordnames.get(lname, []) # match to successive levels of precision for identification - res = [ ] + res = [] for lm in lmatches: if lm["title"] != ltitle: # mismatch title continue - if llordname and llordofname: # two name case + if llordname and llordofname: # two name case if (lm["lordname"] == llordname) and (lm["lordofname"] == llordofname): if lm["start_date"] <= sdate <= lm["end_date"]: res.append(lm) @@ -128,29 +156,62 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD if lname == lmlname: if lm["start_date"] <= sdate <= lm["end_date"]: if lm["lordname"] and llordofname: - #if not IsNotQuiet(): - print("cm---", ltitle, lm["lordname"], lm["lordofname"], llordname, llordofname) - raise ContextException("lordofname matches lordname in lordlist", stamp=stampurl, fragment=lname) + # if not IsNotQuiet(): + print( + "cm---", + ltitle, + lm["lordname"], + lm["lordofname"], + llordname, + llordofname, + ) + raise ContextException( + "lordofname matches lordname in lordlist", + stamp=stampurl, + fragment=lname, + ) else: assert lm["lordofname"] and llordname # of-name distinction lost in division lists if not bDivision: - raise ContextException("lordname matches lordofname in lordlist", stamp=stampurl, fragment=lname) + raise ContextException( + "lordname matches lordofname in lordlist", + stamp=stampurl, + fragment=lname, + ) res.append(lm) - elif ltitle != "Bishop" and ltitle != "Archbishop" and (ltitle, lname) not in (("Duke", "Norfolk"), ("Duke", "Wellington"), ('Earl', 'Kinnoull'), ('Earl', 'Selborne')): + elif ( + ltitle != "Bishop" + and ltitle != "Archbishop" + and (ltitle, lname) + not in ( + ("Duke", "Norfolk"), + ("Duke", "Wellington"), + ("Earl", "Kinnoull"), + ("Earl", "Selborne"), + ) + ): print(lm) - raise ContextException("wrong dates on lords with same name", stamp=stampurl, fragment=lname) + raise ContextException( + "wrong dates on lords with same name", + stamp=stampurl, + fragment=lname, + ) if not res: - raise ContextException("unknown lord %s %s %s %s on %s" % (ltitle, llordname, llordofname, stampurl, sdate), stamp=stampurl, fragment=lname) + raise ContextException( + "unknown lord %s %s %s %s on %s" + % (ltitle, llordname, llordofname, stampurl, sdate), + stamp=stampurl, + fragment=lname, + ) assert len(res) == 1 return self.membertoperson(res[0]["id"]) - def GetLordIDfname(self, name, loffice, sdate, stampurl=None): name = re.sub("^The ", "", name) - name = name.replace(' Of ', ' of ') + name = name.replace(" Of ", " of ") if name in self.aliases: return self.aliases[name] @@ -160,7 +221,9 @@ def GetLordIDfname(self, name, loffice, sdate, stampurl=None): hom 
= honcompl.match(name) if not hom: - raise ContextException("lord name format failure on '%s'" % name, stamp=stampurl, fragment=name) + raise ContextException( + "lord name format failure on '%s'" % name, stamp=stampurl, fragment=name + ) # now we have a speaker, try and break it up ltit = hom.group(1) @@ -181,25 +244,28 @@ def GetLordIDfname(self, name, loffice, sdate, stampurl=None): return self.GetLordID(ltit, lname, lplace, loffice, stampurl, sdate, False) - def MatchRevName(self, fss, sdate, stampurl): assert fss - lfn = re.match('(.*?)(?: of (.*?))?, {0,3}((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$', fss) + lfn = re.match( + "(.*?)(?: of (.*?))?, {0,3}((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$", fss + ) if not lfn: print("$$$%s$$$" % fss) - raise ContextException("No match of format in MatchRevName", stamp=stampurl, fragment=fss) + raise ContextException( + "No match of format in MatchRevName", stamp=stampurl, fragment=fss + ) shorttitle = lfn.group(3) - if shorttitle[-1] != '.': + if shorttitle[-1] != ".": shorttitle += "." ltitle = titleconv[shorttitle] llordname = lfn.group(1).replace(".", "") llordname = llordname.replace("'", "'") llordname = re.sub("^De ", "de ", llordname) - fullname = '%s %s' % (ltitle, llordname) + fullname = "%s %s" % (ltitle, llordname) llordofname = "" if lfn.group(2): llordofname = lfn.group(2).replace(".", "") - fullname = '%s of %s' % (fullname, llordofname) + fullname = "%s of %s" % (fullname, llordofname) if fullname in self.aliases: return self.aliases[fullname] diff --git a/pyscraper/miscfuncs.py b/pyscraper/miscfuncs.py index 1cc463e3..ec06620a 100755 --- a/pyscraper/miscfuncs.py +++ b/pyscraper/miscfuncs.py @@ -1,43 +1,51 @@ +import os import re -import sys import string -import os import tempfile # make the top path data directory value -toppath = os.path.abspath('../../parldata') +toppath = os.path.abspath("../../parldata") if not os.path.exists(toppath): - toppath = os.path.abspath('../../../parldata') + toppath = os.path.abspath("../../../parldata") if not os.path.exists(toppath): - toppath = os.path.abspath(os.path.expanduser('~/parldata/')) + toppath = os.path.abspath(os.path.expanduser("~/parldata/")) if not os.path.exists(toppath): - toppath = 'C:\\parldata' + toppath = "C:\\parldata" # output directories used for the scraper pwcmdirs = os.path.join(toppath, "cmpages") pwxmldirs = os.path.join(toppath, "scrapedxml") -pwpatchesdirs = os.path.abspath("patches") # made locally, relative to the lazyrunall.py module. Should be relative to toppath eventually +pwpatchesdirs = os.path.abspath( + "patches" +) # made locally, relative to the lazyrunall.py module. Should be relative to toppath eventually -if (not os.path.isdir(toppath)): - raise Exception('Data directory %s does not exist, please create' % (toppath)) +if not os.path.isdir(toppath): + raise Exception("Data directory %s does not exist, please create" % (toppath)) # print "Data directory (set in miscfuncs.py): %s" % toppath # temporary files are stored here tmppath = os.path.join(toppath, "tmp") -if (not os.path.isdir(tmppath)): - os.mkdir(tmppath) +if not os.path.isdir(tmppath): + os.mkdir(tmppath) tempfilename = tempfile.mktemp("", "pw-gluetemp-", tmppath) # find raw data path rawdatapath = os.path.join(os.getcwd(), "../rawdata") -if (not os.path.isdir(toppath)): - raise Exception('Raw data directory %s does not exist, you\'ve not got a proper checkout from CVS.' 
% (toppath)) +if not os.path.isdir(toppath): + raise Exception( + "Raw data directory %s does not exist, you've not got a proper checkout from CVS." + % (toppath) + ) # quiet flag bNotQuiet = True + + def SetQuiet(): global bNotQuiet bNotQuiet = False + + def IsNotQuiet(): return bNotQuiet @@ -48,16 +56,17 @@ def IsNotQuiet(): # use this to generate chronological scraped files of the same page def NextAlphaString(s): - assert re.match('[a-z]*$', s) + assert re.match("[a-z]*$", s) if not s: - return 'a' + return "a" i = string.ascii_lowercase.find(s[-1]) + 1 if i < len(string.ascii_lowercase): return s[:-1] + string.ascii_lowercase[i] - return NextAlphaString(s[:-1]) + 'a' + return NextAlphaString(s[:-1]) + "a" + def AlphaStringToOrder(s): - assert re.match('[a-z]*$', s) + assert re.match("[a-z]*$", s) res = 0 while s: i = string.ascii_lowercase.find(s[0]) + 1 @@ -65,6 +74,7 @@ def AlphaStringToOrder(s): s = s[1:] return res + # Impossible to do 6pm, 7.15pm, 6.30pm, 6.45pm, 7pm without future timestamps # So not caring any more about timestamp errors # Need good timestamps for video ;-) So turning back on, might try different tack at some point @@ -73,8 +83,10 @@ def AlphaStringToOrder(s): regparsetime = re.compile("^(\d+)[\.:]\s*(\d+)(?:\s?| )([\w\.]*)$") # 7 pm regparsetimeonhour = re.compile("^(\d+)()(?:\s?| )([\w\.]+)$") + + def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): - #print "time ", time + # print "time ", time previoustime = None if previoustimearr: @@ -88,7 +100,7 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): timeparts = regparsetimeonhour.match(time) if timeparts: hour = int(timeparts.group(1)) - if (timeparts.group(2) != ""): + if timeparts.group(2) != "": mins = int(timeparts.group(2)) else: mins = 0 @@ -114,12 +126,15 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): hour -= 12 if previoustime and previoustimehour + 12 <= hour: - print("TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? %s" % (previoustime, time, repr(stampurl))) + print( + "TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? 
%s" + % (previoustime, time, repr(stampurl)) + ) - elif time == 'Midnight': + elif time == "Midnight": hour = 24 mins = 0 - elif time == 'Noon': + elif time == "Noon": hour = 12 mins = 0 else: @@ -127,11 +142,10 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): res = "%03d:%02d:00" % (hour, mins) - # day-rotate situation where they went on beyond midnight # it's uncommon enough to handle by listing exceptional days # (sometimes the division time is out of order because that is where it is inserted in the record -- maybe should patch to handle) - #print previoustime, res, bIsDivisionTime, stampurl.sdate + # print previoustime, res, bIsDivisionTime, stampurl.sdate if previoustime and res < previoustime: if stampurl.sdate in ["2005-03-10"]: if previoustime < "024": @@ -142,14 +156,21 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): elif stampurl.sdate in ["2002-10-28"]: return res - elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in ["2003-10-20", "2000-10-03", "2000-07-24", "2011-01-17"]: + elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in [ + "2003-10-20", + "2000-10-03", + "2000-07-24", + "2011-01-17", + ]: hour += 24 else: - print('TIME: time rotation (from %s to %s %s) not close to midnight %s' % (previoustime, time, res, repr(stampurl))) + print( + "TIME: time rotation (from %s to %s %s) not close to midnight %s" + % (previoustime, time, res, repr(stampurl)) + ) res = "%03d:%02d:00" % (hour, mins) - # capture the case where we are out of order by more than a few minutes # (divisions are often out of order slightly) @@ -160,7 +181,10 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): previoustimeminutes = previoustimehour * 60 + int(prevtimeMatch.group(2)) if timeminutes < previoustimeminutes: if not bIsDivisionTime or (previoustimeminutes - timeminutes > 10): - print('TIME: time out of order, from %s to %s (division=%s) %s' % (previoustime, res, bIsDivisionTime, repr(stampurl))) + print( + "TIME: time out of order, from %s to %s (division=%s) %s" + % (previoustime, res, bIsDivisionTime, repr(stampurl)) + ) return res @@ -168,84 +192,72 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): # http://www.bigbaer.com/reference/character_entity_reference.htm # Make sure you update WriteXMLHeader in xmlfilewrite.py also! 
entitymap = { - ' ':' ', - '&':'&', - - # see http://www.cs.tut.fi/~jkorpela/www/windows-chars.html for a useful, if now dated in - # terms of browser support for the proper solutions, info on windows ndash/mdash (150/151) - '–':'–', # convert windows latin-1 extension ndash into a real one - '—':'—', # likewise mdash - '¡':'¡', # inverted exclamation mark - '÷':'÷', # division sign - - 'è':'è', # this is e-grave - 'é':'é', # this is e-acute - 'ê':'ê', # this is e-hat - 'ë':'ë', # this is e-double-dot - - 'ß':'ß', - 'à':'à', # this is a-grave - 'á':'á', # this is a-acute - 'â':'â', # this is a-hat as in debacle - 'ã':'ã', # this is a-hat as in debacle - 'ä':'ä', - - 'ô':'ô', # this is o-hat - 'ö':'ö', # this is o-double-dot - 'Ö':'Ö', # this is capital o-double-dot - 'ó':'ó', # this is o-acute - 'ø':'ø', # this is o-slash - 'õ':'õ', # this is o-tilde - - 'í':'í', # this is i-acute - 'î':'î', # this is i-circumflex - 'ï':'ï', # this is i-double-dot, as in naive - - 'ç':'ç', # this is cedilla - 'ú':'ú', - 'ü':'ü', # this is u-double-dot - 'ñ':'ñ', # spanish n as in Senor - 'þ':'þ', - - '±':'±', # this is +/- symbol - '£':'£', # UK currency - '§':'§', # UK currency - '©':'©', - '·':'·', # middle dot - '°':'°', # this is the degrees - 'º':'º', # this is the M ordinal - '®':'®', # this is the degrees - '¶':'¶', # end-paragraph (pi) symbol - - 'µ':'µ', # this is one quarter symbol - '¼':'¼', # this is one quarter symbol - '½':'½', # this is one half symbol - '¾':'¾', # this is three quarter symbol - - '#':'#', # this is hash - '_':'_', # this is underscore symbol - '_':'_', # this is underscore symbol - - ''':"'", # possession apostrophe - "€":'€', # this is euro currency - "™":'™', - "•":'•', - '&lquo;':"'", - '&rquo;':"'", - '−':"-", - - '‘':"'", - '’':"'", - '“':'"', - '”':'"', - '…':'...', - '†':'†', - - '²':'²', - '’':"'", - 'œ':'œ', - 'æ':'æ', - '†':'†', + " ": " ", + "&": "&", + # see http://www.cs.tut.fi/~jkorpela/www/windows-chars.html for a useful, if now dated in + # terms of browser support for the proper solutions, info on windows ndash/mdash (150/151) + "–": "–", # convert windows latin-1 extension ndash into a real one + "—": "—", # likewise mdash + "¡": "¡", # inverted exclamation mark + "÷": "÷", # division sign + "è": "è", # this is e-grave + "é": "é", # this is e-acute + "ê": "ê", # this is e-hat + "ë": "ë", # this is e-double-dot + "ß": "ß", + "à": "à", # this is a-grave + "á": "á", # this is a-acute + "â": "â", # this is a-hat as in debacle + "ã": "ã", # this is a-hat as in debacle + "ä": "ä", + "ô": "ô", # this is o-hat + "ö": "ö", # this is o-double-dot + "Ö": "Ö", # this is capital o-double-dot + "ó": "ó", # this is o-acute + "ø": "ø", # this is o-slash + "õ": "õ", # this is o-tilde + "í": "í", # this is i-acute + "î": "î", # this is i-circumflex + "ï": "ï", # this is i-double-dot, as in naive + "ç": "ç", # this is cedilla + "ú": "ú", + "ü": "ü", # this is u-double-dot + "ñ": "ñ", # spanish n as in Senor + "þ": "þ", + "±": "±", # this is +/- symbol + "£": "£", # UK currency + "§": "§", # UK currency + "©": "©", + "·": "·", # middle dot + "°": "°", # this is the degrees + "º": "º", # this is the M ordinal + "®": "®", # this is the degrees + "¶": "¶", # end-paragraph (pi) symbol + "µ": "µ", # this is one quarter symbol + "¼": "¼", # this is one quarter symbol + "½": "½", # this is one half symbol + "¾": "¾", # this is three quarter symbol + "#": "#", # this is hash + "_": "_", # this is underscore symbol + "_": "_", # this is underscore symbol + "'": "'", # possession 
apostrophe + "€": "€", # this is euro currency + "™": "™", + "•": "•", + "&lquo;": "'", + "&rquo;": "'", + "−": "-", + "‘": "'", + "’": "'", + "“": """, + "”": """, + "…": "...", + "†": "†", + "²": "²", + "’": "'", + "œ": "œ", + "æ": "æ", + "†": "†", } entitymaprev = entitymap.values() @@ -253,14 +265,14 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): def StripAnchorTags(text): raise Exception("I've never called this function, so test it") - abf = re.split('(<[^>]*>)', text) + abf = re.split("(<[^>]*>)", text) - ret = '' + ret = "" for ab in abf: - if re.match(']*>(?i)', ab): + if re.match("]*>(?i)", ab): pass - elif re.match('(?i)', ab): + elif re.match("(?i)", ab): pass else: @@ -270,148 +282,163 @@ def StripAnchorTags(text): def WriteCleanText(fout, text, striphref=True): - text = re.sub('', '', text) - abf = re.split('(<[^>]*>)', text) + text = re.sub("", "", text) + abf = re.split("(<[^>]*>)", text) for ab in abf: # delete comments and links - if re.match(']*?->', ab): + if re.match("]*?->", ab): pass # XXX Differs from pullgluepages version - elif striphref and re.match(']+>(?i)', ab): - anamem = re.match(']+>(?i)", ab): + anamem = re.match("(?i)', ab): + elif striphref and re.match("(?i)", ab): pass # spaces only inside tags - elif re.match('<[^>]*>', ab): - fout.write(re.sub('\s', ' ', ab)) + elif re.match("<[^>]*>", ab): + fout.write(re.sub("\s", " ", ab)) # take out spurious > symbols and dos linefeeds else: - fout.write(re.sub('>|\r', '', ab)) + fout.write(re.sub(">|\r", "", ab)) # Legacy patch system, use patchfilter.py and patchtool now def ApplyFixSubstitutions(text, sdate, fixsubs): for sub in fixsubs: - if sub[3] == 'all' or sub[3] == sdate: + if sub[3] == "all" or sub[3] == sdate: (text, n) = re.subn(sub[0], sub[1], text) if (sub[2] != -1) and (n != sub[2]): print(sub) - raise Exception('wrong number of substitutions %d on %s' % (n, sub[0])) + raise Exception("wrong number of substitutions %d on %s" % (n, sub[0])) return text # this only accepts and tags def StraightenHTMLrecurse(stex, stampurl): # split the text into and and and - qisup = re.search(r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?))(?i)', stex) + qisup = re.search( + r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?))(?i)', stex + ) if qisup: qtagtype = qisup.group(2) - qhref = qisup.group(3) or '' - qtag = ('<%s%s>' % (qtagtype, qhref), '' % qtagtype) + qhref = qisup.group(3) or "" + qtag = ("<%s%s>" % (qtagtype, qhref), "" % qtagtype) if not qisup: qisup = re.search('(<(a) href="([^"]*)">(.*?))(?i)', stex) if qisup: - qtag = ('' % qisup.group(3), '') + qtag = ('' % qisup.group(3), "") if qisup: - sres = StraightenHTMLrecurse(stex[:qisup.start(1)], stampurl) + sres = StraightenHTMLrecurse(stex[: qisup.start(1)], stampurl) sres.append(qtag[0]) sres.extend(StraightenHTMLrecurse(qisup.group(4), stampurl)) sres.append(qtag[1]) - sres.extend(StraightenHTMLrecurse(stex[qisup.end(1):], stampurl)) + sres.extend(StraightenHTMLrecurse(stex[qisup.end(1) :], stampurl)) return sres - sres = re.split('(&[a-z0-9]*?;|&#\d+;|"|\xa3|&|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|<|>)', stex) + sres = re.split( + '(&[a-z0-9]*?;|&#\d+;|"|\xa3|&|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|<|>)', + stex, + ) for i in range(len(sres)): - #print "sresi ", sres[i], "\n" - #print "-----------------------------------------------\n" + # print "sresi ", sres[i], "\n" + # print "-----------------------------------------------\n" if not sres[i]: pass - elif re.match('&#[0-9]+;', 
sres[i]) and not re.match('[345][0-9];', sres[i]): + elif re.match("&#[0-9]+;", sres[i]) and not re.match("[345][0-9];", sres[i]): pass - elif sres[i][0] == '&': + elif sres[i][0] == "&": if sres[i] in entitymap: sres[i] = entitymap[sres[i]] elif sres[i] in entitymaprev: pass - elif sres[i] == '—': # special case as entitymap maps it with spaces + elif sres[i] == "—": # special case as entitymap maps it with spaces pass - elif sres[i] in ('"', '&', '<', '>'): + elif sres[i] in (""", "&", "<", ">"): pass - elif sres[i] in ('“', '”'): - sres[i] = '"' + elif sres[i] in ("“", "”"): + sres[i] = """ else: - raise Exception(sres[i] + ' unknown ent') - sres[i] = 'UNKNOWN-ENTITY' + raise Exception(sres[i] + " unknown ent") + sres[i] = "UNKNOWN-ENTITY" elif sres[i] == '"': - sres[i] = '"' + sres[i] = """ # junk chars sometimes get in # NB this only works if the characters are split in the regexp above - elif sres[i] == '\x01': - sres[i] = '' - elif sres[i] == '\x0e': - sres[i] = ' ' - elif sres[i] == '\x14': - sres[i] = ' ' - elif sres[i] == '\x92': + elif sres[i] == "\x01": + sres[i] = "" + elif sres[i] == "\x0e": + sres[i] = " " + elif sres[i] == "\x14": + sres[i] = " " + elif sres[i] == "\x92": sres[i] = "'" - elif sres[i] == '\xa3': - sres[i] = '£' - elif sres[i] == '\xb0': - sres[i] = '°' - elif sres[i] == '\xab': - sres[i] = 'é' - elif sres[i] == '\xe9': - sres[i] = 'é' - elif sres[i] == '\xc3\xb8': - sres[i] = 'ø' - elif sres[i] == '\xc3\xb1': - sres[i] = 'ñ' - - elif re.match('$(?i)', sres[i]): - sres[i] = '' # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE' - - elif re.match('$', sres[i]): # what is this? wrans 2003-05-13 has one - sres[i] = '' + elif sres[i] == "\xa3": + sres[i] = "£" + elif sres[i] == "\xb0": + sres[i] = "°" + elif sres[i] == "\xab": + sres[i] = "é" + elif sres[i] == "\xe9": + sres[i] = "é" + elif sres[i] == "\xc3\xb8": + sres[i] = "ø" + elif sres[i] == "\xc3\xb1": + sres[i] = "ñ" + + elif re.match("$(?i)", sres[i]): + sres[i] = "" # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE' + + elif re.match( + "$", sres[i] + ): # what is this? wrans 2003-05-13 has one + sres[i] = "" # allow brs through - elif re.match('
$(?i)', sres[i]): - sres[i] = '
' + elif re.match("
$(?i)", sres[i]): + sres[i] = "
" # discard garbage that appears in recent today postings - elif re.match('$(?i)', sres[i]): - sres[i] = '' + elif re.match("$(?i)", sres[i]): + sres[i] = "" - elif sres[i][0] == '<' or sres[i][0] == '>': + elif sres[i][0] == "<" or sres[i][0] == ">": print("Part:", sres[i][0]) - print("All:",sres[i]) + print("All:", sres[i]) print("stex:", stex) print("raising") - raise ContextException('tag %s tag out of place in %s' % (sres[i], stex), stamp=stampurl, fragment=stex) + raise ContextException( + "tag %s tag out of place in %s" % (sres[i], stex), + stamp=stampurl, + fragment=stex, + ) return sres # The lookahead assertion (?=
      |
||
|]*>(?i)' -reparts = re.compile('(|(?=]*?>|' + restmatcher + ')') +restmatcher = paratag + "|
        |
||
|]*>(?i)" +reparts = re.compile("(|(?=]*?>|" + restmatcher + ")") + +retable = re.compile("(?i)") +retablestart = re.compile("
      |
|||]*>|]*>$(?i)" +) +reparaempty = re.compile("(?:\s|| )*$(?i)") +reitalif = re.compile("\s*\s*$(?i)") -retable = re.compile('(?i)') -retablestart = re.compile('
      |
|||]*>|]*>$(?i)') -reparaempty = re.compile('(?:\s|| )*$(?i)') -reitalif = re.compile('\s*\s*$(?i)') # Break text into paragraphs. # the result alternates between lists of space types, and strings @@ -423,12 +450,11 @@ def SplitParaSpace(text, stampurl): # list of space objects, list of string spclist = [] - pstring = '' + pstring = "" parts = reparts.split(text) newparts = [] # split up the start bits without end
into component parts for nf in parts: - # a tiny bit of extra splitting up as output if retablestart.match(nf) and not retable.match(nf): newparts.extend(reparts2.split(nf)) @@ -437,11 +463,11 @@ def SplitParaSpace(text, stampurl): # get rid of blank and boring paragraphs if reparaempty.match(nf): - if pstring and re.search('\S', nf): + if pstring and re.search("\S", nf): print(text) - print('---' + pstring) - print('---' + nf) - raise Exception(' it carried across empty para ') + print("---" + pstring) + print("---" + nf) + raise Exception(" it carried across empty para ") continue # list of space type objects @@ -456,11 +482,10 @@ def SplitParaSpace(text, stampurl): print(text) print(spclist) print(pstring) - raise Exception(' double italic in paraspace ') - pstring = '' + raise Exception(" double italic in paraspace ") + pstring = "" continue - # we now have a string of a paragraph which we are putting into the list. # table type @@ -468,7 +493,7 @@ def SplitParaSpace(text, stampurl): if retable.match(nf): if pstring: print(text) - raise Exception(' non-empty preceding string ') + raise Exception(" non-empty preceding string ") pstring = nf bthisparaalone = True @@ -479,21 +504,22 @@ def SplitParaSpace(text, stampurl): else: pstring = lnf.strip() - # check that paragraphs have some text - if re.match('(?:<[^>]*>|\s)*$', pstring): + if re.match("(?:<[^>]*>|\s)*$", pstring): print("\nspclist:", spclist) print("\npstring:", pstring) print("\nthe text:", text[:100]) print("\nnf:", nf) - raise ContextException('no text in paragraph', stamp=stampurl, fragment=pstring) + raise ContextException( + "no text in paragraph", stamp=stampurl, fragment=pstring + ) # check that paragraph spaces aren't only font text, and have something # real in them, unless they are breaks because of tables if not (bprevparaalone or bthisparaalone): bnonfont = False for sl in spclist: - if not re.match(']*>(?i)', sl): + if not re.match("]*>(?i)", sl): bnonfont = True if not bnonfont: print("text:", text) @@ -502,17 +528,20 @@ def SplitParaSpace(text, stampurl): print("----------") print("nf", nf) print("----------") - raise ContextException('font found in middle of paragraph should be a paragraph break or removed', stamp=stampurl, fragment=pstring) + raise ContextException( + "font found in middle of paragraph should be a paragraph break or removed", + stamp=stampurl, + fragment=pstring, + ) bprevparaalone = bthisparaalone - # put the preceding space, then the string into output list res.append(spclist) res.append(pstring) - #print "???%s???" % pstring + # print "???%s???" % pstring - spclist = [ ] - pstring = '' + spclist = [] + pstring = "" # findal spaces into the output list res.append(spclist) @@ -523,27 +552,29 @@ def SplitParaSpace(text, stampurl): # Break text into paragraphs and mark the paragraphs according to their