\s*.*?]*>([^<]*)', page)
for row in m:
url, name = row
- pid, canonname, canoncons = memberList.matchfullnamecons(name, None, today)
+ pid, canonname, canoncons = memberList.matchfullnamecons(name, None, today)
print((' ' % (pid, name)))
-print('')
-
+print("")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..a7b7661f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,72 @@
+[tool.poetry]
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.9"
+beautifulsoup4 = "4.12.3"
+everypolitician = "0.0.13"
+lxml = "5.2.1"
+python-dateutil = "2.2"
+requests = { version = "2.32.3", extras = ["security"] }
+requests-cache = "0.4.13"
+Click = "7.0"
+click-log = "0.3.2"
+
+[tool.poetry.group.dev.dependencies]
+ruff = "^0.6.7"
+
+[tool.ruff]
+extend-include = ["scripts/*"]
+extend-exclude = [
+ 'scripts/.gitignore',
+ 'scripts/2016_data_update/README.txt',
+ 'scripts/config.pm.incvs',
+ 'scripts/consts',
+ 'scripts/crontab',
+ 'scripts/dailyupdate',
+ 'scripts/datadotparl/mp-party-check',
+ 'scripts/datadotparl/one-off-add-pims-ids',
+ 'scripts/datadotparl/one-off-sync-lord-parties',
+ 'scripts/dircmp',
+ 'scripts/divisionextractor.pl',
+ 'scripts/morningupdate',
+ 'scripts/ni-format-revert',
+ 'scripts/ni_membership.php',
+ 'scripts/one-off-move-names-to-persons',
+ 'scripts/other-sites-update',
+ 'scripts/updatedaterange-parse',
+ 'scripts/updatedaterange-scrape',
+ 'scripts/weeklyupdate',
+ 'scripts/ynmp/tests.txt'
+]
+
+[tool.ruff.lint]
+select = [
+ "E",
+ # flake8
+ "F",
+ # isort
+ "I",
+]
+ignore = [
+ # line too long, sorted with formatter where it can be
+ "E501",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["hub"]
+section-order = [
+ "future",
+ "standard-library",
+ "django",
+ "third-party",
+ "first-party",
+ "local-folder"
+]
+
+[tool.ruff.lint.isort.sections]
+django = ["django"]
\ No newline at end of file
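For orientation, the isort settings above add a custom "django" section between the standard-library and third-party groups. A minimal sketch of the import grouping this configuration should produce, assuming a module that touches all five sections (every module name here except "hub", which comes from known-first-party, is illustrative):

    from __future__ import annotations  # "future"

    import os  # "standard-library"

    from django.db import models  # "django" (custom section defined above)

    import requests  # "third-party"

    from hub.models import Area  # "first-party" (known-first-party = ["hub"])

    from .utils import helper  # "local-folder" (hypothetical local module)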
diff --git a/pyscraper/base_resolver.py b/pyscraper/base_resolver.py
index ea7939b4..62beacab 100644
--- a/pyscraper/base_resolver.py
+++ b/pyscraper/base_resolver.py
@@ -2,177 +2,211 @@
import os
import re
-members_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'members'))
+members_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "members"))
+
class ResolverBase(object):
def __init__(self):
self.reloadJSON()
def reloadJSON(self):
- self.members = {} # ID --> membership
- self.persons = {} # ID --> person
- self.fullnames = {} # "Firstname Lastname" --> memberships
- self.lastnames = {} # Surname --> memberships
-
- self.constoidmap = {} # constituency name --> cons attributes (with date and ID)
- self.considtonamemap = {} # cons ID --> name
- self.considtomembermap = {} # cons ID --> memberships
- self.historichansard = {} # Historic Hansard commons membership ID -> MPs
- self.pims = {} # Pims membership ID and date -> MPs
- self.mnis = {} # Parliament Member Names ID to person
-
- self.parties = {} # party --> memberships
- self.membertopersonmap = {} # member ID --> person ID
- self.persontomembermap = {} # person ID --> memberships
+ self.members = {} # ID --> membership
+ self.persons = {} # ID --> person
+ self.fullnames = {} # "Firstname Lastname" --> memberships
+ self.lastnames = {} # Surname --> memberships
+
+ self.constoidmap = {} # constituency name --> cons attributes (with date and ID)
+ self.considtonamemap = {} # cons ID --> name
+ self.considtomembermap = {} # cons ID --> memberships
+ self.historichansard = {} # Historic Hansard commons membership ID -> MPs
+ self.pims = {} # Pims membership ID and date -> MPs
+ self.mnis = {} # Parliament Member Names ID to person
+
+ self.parties = {} # party --> memberships
+ self.membertopersonmap = {} # member ID --> person ID
+ self.persontomembermap = {} # person ID --> memberships
def import_constituencies(self):
- data = json.load(open(os.path.join(members_dir, 'people.json')))
- for con in data['posts']:
- if con['organization_id'] != self.import_organization_id:
+ data = json.load(open(os.path.join(members_dir, "people.json")))
+ for con in data["posts"]:
+ if con["organization_id"] != self.import_organization_id:
continue
attr = {
- 'id': con['id'],
- 'start_date': con.get('start_date', '0000-00-00'),
- 'end_date': con.get('end_date', '9999-12-31'),
+ "id": con["id"],
+ "start_date": con.get("start_date", "0000-00-00"),
+ "end_date": con.get("end_date", "9999-12-31"),
}
- if len(attr['start_date']) == 4:
- attr['start_date'] = '%s-01-01' % attr['start_date']
- if len(attr['end_date']) == 4:
- attr['end_date'] = '%s-12-31' % attr['end_date']
+ if len(attr["start_date"]) == 4:
+ attr["start_date"] = "%s-01-01" % attr["start_date"]
+ if len(attr["end_date"]) == 4:
+ attr["end_date"] = "%s-12-31" % attr["end_date"]
- names = [con['area']['name']] + con['area'].get('other_names', [])
+ names = [con["area"]["name"]] + con["area"].get("other_names", [])
for name in names:
- if not con['id'] in self.considtonamemap:
- self.considtonamemap[con['id']] = name
+ if con["id"] not in self.considtonamemap:
+ self.considtonamemap[con["id"]] = name
self.constoidmap.setdefault(name, []).append(attr)
nopunc = self.strip_punctuation(name)
self.constoidmap.setdefault(nopunc, []).append(attr)
def strip_punctuation(self, cons):
- nopunc = cons.replace(',','').replace('-','').replace(' ','').lower().strip()
+ nopunc = cons.replace(",", "").replace("-", "").replace(" ", "").lower().strip()
return nopunc
def import_people_json(self):
- data = json.load(open(os.path.join(members_dir, 'people.json')))
- posts = {post['id']: post for post in data['posts']}
- orgs = {org['id']: org for org in data['organizations']}
- for mship in data['memberships']:
+ data = json.load(open(os.path.join(members_dir, "people.json")))
+ posts = {post["id"]: post for post in data["posts"]}
+ orgs = {org["id"]: org for org in data["organizations"]}
+ for mship in data["memberships"]:
self.import_people_membership(mship, posts, orgs)
- for person in data['persons']:
+ for person in data["persons"]:
self.import_people_names(person)
def import_people_membership(self, mship, posts, orgs):
- if 'post_id' not in mship or posts[mship['post_id']]['organization_id'] != self.import_organization_id:
+ if (
+ "post_id" not in mship
+ or posts[mship["post_id"]]["organization_id"] != self.import_organization_id
+ ):
return
if mship["id"] in self.membertopersonmap:
raise Exception("Same member id %s appeared twice" % mship["id"])
- self.membertopersonmap[mship["id"]] = mship['person_id']
- self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"])
+ self.membertopersonmap[mship["id"]] = mship["person_id"]
+ self.persontomembermap.setdefault(mship["person_id"], []).append(mship["id"])
if self.members.get(mship["id"]):
raise Exception("Repeated identifier %s in members JSON file" % mship["id"])
self.members[mship["id"]] = mship
- if 'end_date' not in mship:
- mship['end_date'] = '9999-12-31'
+ if "end_date" not in mship:
+ mship["end_date"] = "9999-12-31"
# index by constituency
- mship['constituency'] = posts[mship['post_id']]['area']['name']
- consids = self.constoidmap[mship['constituency']]
+ mship["constituency"] = posts[mship["post_id"]]["area"]["name"]
+ consids = self.constoidmap[mship["constituency"]]
consid = None
# find the constituency id for this person
- mship_start_date = len(mship['start_date'])==4 and ('%s-01-01' % mship['start_date']) or mship['start_date']
- mship_end_date = len(mship['end_date'])==4 and ('%s-12-31' % mship['end_date']) or mship['end_date']
+ mship_start_date = (
+ len(mship["start_date"]) == 4
+ and ("%s-01-01" % mship["start_date"])
+ or mship["start_date"]
+ )
+ mship_end_date = (
+ len(mship["end_date"]) == 4
+ and ("%s-12-31" % mship["end_date"])
+ or mship["end_date"]
+ )
for cons in consids:
- if (cons['start_date'] <= mship_start_date and
- mship_start_date <= mship_end_date and
- mship_end_date <= cons['end_date']):
- if consid and consid != cons['id']:
- raise Exception("Two constituency ids %s %s overlap with MP %s" % (consid, cons['id'], mship['id']))
- consid = cons['id']
+ if (
+ cons["start_date"] <= mship_start_date
+ and mship_start_date <= mship_end_date
+ and mship_end_date <= cons["end_date"]
+ ):
+ if consid and consid != cons["id"]:
+ raise Exception(
+ "Two constituency ids %s %s overlap with MP %s"
+ % (consid, cons["id"], mship["id"])
+ )
+ consid = cons["id"]
if not consid:
raise Exception("Constituency '%s' not found" % mship["constituency"])
# check name in members file is same as default in cons file
backformed_cons = self.considtonamemap[consid]
if backformed_cons != mship["constituency"]:
- raise Exception("Constituency '%s' in members file differs from first constituency '%s' listed in cons file" % (mship["constituency"], backformed_cons))
+ raise Exception(
+ "Constituency '%s' in members file differs from first constituency '%s' listed in cons file"
+ % (mship["constituency"], backformed_cons)
+ )
# check first date ranges don't overlap, MPs only
# Only check modern MPs as we might have overlapping data previously
- if self.import_organization_id == 'house-of-commons':
+ if self.import_organization_id == "house-of-commons":
for cons in self.considtomembermap.get(consid, []):
- if cons['end_date'] < '1997-05-01': continue
- if cons['start_date'] <= mship['start_date'] <= cons['end_date'] \
- or cons['start_date'] <= mship['end_date'] <= cons['end_date'] \
- or mship['start_date'] <= cons['start_date'] <= mship['end_date'] \
- or mship['start_date'] <= cons['end_date'] <= mship['end_date']:
- raise Exception("%s %s Two MP entries for constituency %s with overlapping dates" % (mship, cons, consid))
+ if cons["end_date"] < "1997-05-01":
+ continue
+ if (
+ cons["start_date"] <= mship["start_date"] <= cons["end_date"]
+ or cons["start_date"] <= mship["end_date"] <= cons["end_date"]
+ or mship["start_date"] <= cons["start_date"] <= mship["end_date"]
+ or mship["start_date"] <= cons["end_date"] <= mship["end_date"]
+ ):
+ raise Exception(
+ "%s %s Two MP entries for constituency %s with overlapping dates"
+ % (mship, cons, consid)
+ )
# then add in
self.considtomembermap.setdefault(consid, []).append(mship)
# ... and by party
- if 'on_behalf_of_id' in mship:
- mship['party'] = orgs[mship['on_behalf_of_id']]['name']
- self.parties.setdefault(mship['party'], []).append(mship)
+ if "on_behalf_of_id" in mship:
+ mship["party"] = orgs[mship["on_behalf_of_id"]]["name"]
+ self.parties.setdefault(mship["party"], []).append(mship)
- if 'hansard_id' in mship:
- self.historichansard.setdefault(int(mship['hansard_id']), []).append(mship)
+ if "hansard_id" in mship:
+ self.historichansard.setdefault(int(mship["hansard_id"]), []).append(mship)
def import_people_names(self, person):
- if person['id'] not in self.persontomembermap:
+ if person["id"] not in self.persontomembermap:
return
- self.persons[person['id']] = person
- memberships = [self.members[x] for x in self.persontomembermap[person['id']]]
- for other_name in person.get('other_names', []):
- if other_name.get('note') == 'Main':
+ self.persons[person["id"]] = person
+ memberships = [self.members[x] for x in self.persontomembermap[person["id"]]]
+ for other_name in person.get("other_names", []):
+ if other_name.get("note") == "Main":
self.import_people_main_name(other_name, memberships)
- elif other_name.get('note') == 'Alternate':
+ elif other_name.get("note") == "Alternate":
self.import_people_alternate_name(person, other_name, memberships)
- for identifier in person.get('identifiers', []):
- if identifier.get('scheme') == 'pims_id':
- id = identifier.get('identifier')
+ for identifier in person.get("identifiers", []):
+ if identifier.get("scheme") == "pims_id":
+ id = identifier.get("identifier")
for m in memberships:
p = person.copy()
- p['start_date'] = m['start_date']
- p['end_date'] = m['end_date']
+ p["start_date"] = m["start_date"]
+ p["end_date"] = m["end_date"]
self.pims.setdefault(id, []).append(p)
- elif identifier.get('scheme') == 'datadotparl_id':
- id = identifier.get('identifier')
+ elif identifier.get("scheme") == "datadotparl_id":
+ id = identifier.get("identifier")
for m in memberships:
p = person.copy()
- p['start_date'] = m['start_date']
- p['end_date'] = m['end_date']
+ p["start_date"] = m["start_date"]
+ p["end_date"] = m["end_date"]
self.mnis.setdefault(id, []).append(p)
def import_people_main_name(self, name, memberships):
- mships = [m for m in memberships if m['start_date'] <= name.get('end_date', '9999-12-31') and m['end_date'] >= name.get('start_date', '1000-01-01')]
- if not mships: return
+ mships = [
+ m
+ for m in memberships
+ if m["start_date"] <= name.get("end_date", "9999-12-31")
+ and m["end_date"] >= name.get("start_date", "1000-01-01")
+ ]
+ if not mships:
+ return
try:
family_name = name["family_name"]
given_name = name["given_name"]
except:
- family_name = name['lordname']
- if name['lordofname']:
- family_name += ' of ' + name['lordofname']
- given_name = name['honorific_prefix']
- compoundname = '%s %s' % (given_name, family_name)
- no_initial = ''
- fnnomidinitial = re.findall('^(\S*)\s\S$', given_name)
+ family_name = name["lordname"]
+ if name["lordofname"]:
+ family_name += " of " + name["lordofname"]
+ given_name = name["honorific_prefix"]
+ compoundname = "%s %s" % (given_name, family_name)
+ no_initial = ""
+ fnnomidinitial = re.findall("^(\S*)\s\S$", given_name)
if fnnomidinitial:
no_initial = fnnomidinitial[0] + " " + family_name
- initial_name = ''
- if self.import_organization_id != 'house-of-commons' and given_name:
+ initial_name = ""
+ if self.import_organization_id != "house-of-commons" and given_name:
initial_name = given_name[0] + " " + family_name
for m in mships:
- newattr = {'id': m['id'], 'person_id': m['person_id']}
+ newattr = {"id": m["id"], "person_id": m["person_id"]}
# merge date ranges - take the smallest range covered by
# the membership, and the alias's range (if it has one)
- newattr['start_date'] = max(m['start_date'], name.get('start_date', '1000-01-01'))
- newattr['end_date'] = min(m['end_date'], name.get('end_date', '9999-12-31'))
+ newattr["start_date"] = max(
+ m["start_date"], name.get("start_date", "1000-01-01")
+ )
+ newattr["end_date"] = min(m["end_date"], name.get("end_date", "9999-12-31"))
self.fullnames.setdefault(compoundname, []).append(newattr)
if no_initial:
self.fullnames.setdefault(no_initial, []).append(newattr)
@@ -181,39 +215,54 @@ def import_people_main_name(self, name, memberships):
self.lastnames.setdefault(family_name, []).append(newattr)
def import_people_alternate_name(self, person, other_name, memberships):
- if other_name.get('organization_id') not in (None, self.import_organization_id): return
- mships = [m for m in memberships if m['start_date'] <= other_name.get('end_date', '9999-12-31') and m['end_date'] >= other_name.get('start_date', '1000-01-01')]
+ if other_name.get("organization_id") not in (None, self.import_organization_id):
+ return
+ mships = [
+ m
+ for m in memberships
+ if m["start_date"] <= other_name.get("end_date", "9999-12-31")
+ and m["end_date"] >= other_name.get("start_date", "1000-01-01")
+ ]
for m in mships:
- newattr = {'id': m['id'], 'person_id': m['person_id']}
+ newattr = {"id": m["id"], "person_id": m["person_id"]}
# merge date ranges - take the smallest range covered by
# the membership, and the alias's range (if it has one)
- newattr['start_date'] = max(m['start_date'], other_name.get('start_date', '1000-01-01'))
- newattr['end_date'] = min(m['end_date'], other_name.get('end_date', '9999-12-31'))
- if other_name.get('family_name'):
- self.lastnames.setdefault(other_name['family_name'], []).append(newattr)
+ newattr["start_date"] = max(
+ m["start_date"], other_name.get("start_date", "1000-01-01")
+ )
+ newattr["end_date"] = min(
+ m["end_date"], other_name.get("end_date", "9999-12-31")
+ )
+ if other_name.get("family_name"):
+ self.lastnames.setdefault(other_name["family_name"], []).append(newattr)
else:
- self.fullnames.setdefault(other_name['name'], []).append(newattr)
+ self.fullnames.setdefault(other_name["name"], []).append(newattr)
# Used by Commons and NI
def name_on_date(self, person_id, date):
person = self.persons[person_id]
- for nm in person['other_names']:
- if nm['note'] != 'Main': continue
- if nm.get('start_date', '0000-00-00') <= date <= nm.get('end_date', '9999-12-31'):
- if 'family_name' in nm:
+ for nm in person["other_names"]:
+ if nm["note"] != "Main":
+ continue
+ if (
+ nm.get("start_date", "0000-00-00")
+ <= date
+ <= nm.get("end_date", "9999-12-31")
+ ):
+ if "family_name" in nm:
name = nm["family_name"]
- if nm.get('given_name'):
+ if nm.get("given_name"):
name = nm["given_name"] + " " + name
- if nm.get('honorific_prefix'):
+ if nm.get("honorific_prefix"):
name = nm["honorific_prefix"] + " " + name
- else: # Lord (e.g. Lord Morrow in NI)
- name = nm['honorific_prefix']
- if nm['lordname']:
- name += ' %s' % nm['lordname']
- if nm['lordofname']:
- name += ' of %s' % nm['lordofname']
+ else: # Lord (e.g. Lord Morrow in NI)
+ name = nm["honorific_prefix"]
+ if nm["lordname"]:
+ name += " %s" % nm["lordname"]
+ if nm["lordofname"]:
+ name += " of %s" % nm["lordofname"]
return name
- raise Exception('No <name> found for %s on %s' % (person['id'], date))
+ raise Exception("No <name> found for %s on %s" % (person["id"], date))
def membertoperson(self, memberid):
return self.membertopersonmap[memberid]
@@ -221,12 +270,12 @@ def membertoperson(self, memberid):
def _match_by_id(self, lookup, id, date):
matches = getattr(self, lookup).get(id, [])
for m in matches:
- if m['start_date'] <= date <= m['end_date']:
+ if m["start_date"] <= date <= m["end_date"]:
return m
return None
def match_by_mnis(self, mnis_id, date):
- return self._match_by_id('mnis', mnis_id, date)
+ return self._match_by_id("mnis", mnis_id, date)
def match_by_pims(self, pims_id, date):
- return self._match_by_id('pims', pims_id, date)
+ return self._match_by_id("pims", pims_id, date)
diff --git a/pyscraper/contextexception.py b/pyscraper/contextexception.py
index 9d1b3b7d..5547caf6 100755
--- a/pyscraper/contextexception.py
+++ b/pyscraper/contextexception.py
@@ -1,9 +1,9 @@
#! $Id: contextexception.py,v 1.12 2004/12/23 12:27:09 goatchurch Exp $
# vim:sw=8:ts=8:et:nowrap
-class ContextException(Exception):
- def __init__(self, description, stamp = None, fragment = None):
+class ContextException(Exception):
+ def __init__(self, description, stamp=None, fragment=None):
self.description = description
self.stamp = stamp
self.fragment = fragment
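For reference, ContextException carries optional parse context alongside the message, and the scrapers below raise it with stamp and fragment keywords (see resolvenames.py further down). A minimal usage sketch, with invented stamp and fragment values:

    from contextexception import ContextException

    try:
        raise ContextException(
            "lord name format failure on 'X.'",  # invented message
            stamp="uk.org.publicwhip/lords/2005-01-01.1.0",  # invented stamp
            fragment="X.",  # invented fragment
        )
    except ContextException as e:
        print(e.description, e.stamp, e.fragment)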
diff --git a/pyscraper/get_links_from_ep.py b/pyscraper/get_links_from_ep.py
index 30db8627..d7e63cac 100755
--- a/pyscraper/get_links_from_ep.py
+++ b/pyscraper/get_links_from_ep.py
@@ -1,37 +1,35 @@
#!/usr/bin/env python3
import operator
-from lxml import etree
+
from everypolitician import EveryPolitician
+from lxml import etree
def output_file(country, legislature, filename):
data = EveryPolitician().country(country).legislature(legislature)
output_filename = "../members/{0}.xml".format(filename)
- root = etree.Element('publicwhip')
+ root = etree.Element("publicwhip")
- sorted_people = sorted(
- data.popolo().persons,
- key=operator.attrgetter('name')
- )
+ sorted_people = sorted(data.popolo().persons, key=operator.attrgetter("name"))
for person in sorted_people:
- parlparse_id = person.identifier_value('parlparse')
+ parlparse_id = person.identifier_value("parlparse")
if parlparse_id is not None:
props = {}
if person.twitter:
- props['twitter_username'] = person.twitter
+ props["twitter_username"] = person.twitter
if person.facebook:
- props['facebook_page'] = person.facebook
+ props["facebook_page"] = person.facebook
if props:
- props['id'] = parlparse_id
- info = etree.Element('personinfo', props)
+ props["id"] = parlparse_id
+ info = etree.Element("personinfo", props)
root.append(info)
et = etree.ElementTree(root)
et.write(output_filename, pretty_print=True)
-output_file('UK', 'Commons', 'social-media-commons')
-output_file('Scotland', 'Parliament', 'social-media-sp')
-output_file('Northern-Ireland', 'Assembly', 'social-media-ni')
+output_file("UK", "Commons", "social-media-commons")
+output_file("Scotland", "Parliament", "social-media-sp")
+output_file("Northern-Ireland", "Assembly", "social-media-ni")
diff --git a/pyscraper/gettwittermps.py b/pyscraper/gettwittermps.py
index 3a10319d..4cecc71e 100755
--- a/pyscraper/gettwittermps.py
+++ b/pyscraper/gettwittermps.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
-import urllib.request
import csv
+import urllib.request
import xml.sax
uri = "http://spreadsheets.google.com/tq?tqx=out:csv&key=0AjWA_TWMI4t_dFI5MWRWZkRWbFJ6MVhHQzVmVndrZnc&hl=en_GB"
@@ -9,34 +9,39 @@
f = urllib.request.urlopen(uri)
csv_data = f.read()
lines = csv_data.split("\n")
-rows = csv.reader(lines.__iter__(), delimiter=',', quotechar='"')
+rows = csv.reader(lines.__iter__(), delimiter=",", quotechar='"')
+
class PeopleParser(xml.sax.handler.ContentHandler):
def __init__(self):
self.parser = xml.sax.make_parser()
self.parser.setContentHandler(self)
- def parse(self,filename):
+
+ def parse(self, filename):
self.office_id_to_person_id = {}
self.parser.parse(filename)
- def startElement(self,name,attrs):
- if name == 'person':
- self.current_person_id = attrs['id']
- elif name == 'office':
- self.office_id_to_person_id[attrs['id']] = self.current_person_id
- def endElement(self,name):
- if name == 'person':
+
+ def startElement(self, name, attrs):
+ if name == "person":
+ self.current_person_id = attrs["id"]
+ elif name == "office":
+ self.office_id_to_person_id[attrs["id"]] = self.current_person_id
+
+ def endElement(self, name):
+ if name == "person":
self.current_person_id = None
+
people_parser = PeopleParser()
people_parser.parse("../members/people.xml")
person_id_to_twitter_username = {}
output_filename = "../members/twitter-commons.xml"
-fp = open(output_filename,"w")
-fp.write('''
+fp = open(output_filename, "w")
+fp.write("""
-''')
+""")
for r in rows:
if len(r) < 5:
@@ -49,8 +54,10 @@ def endElement(self,name):
if len(twitter_username) == 0:
continue
if member_id not in people_parser.office_id_to_person_id:
- raise "No person ID found for %s in line %s" % (member_id,"#".join(r))
+ raise "No person ID found for %s in line %s" % (member_id, "#".join(r))
person_id = people_parser.office_id_to_person_id[member_id]
- fp.write("\n"%(person_id,twitter_username))
+ fp.write(
+ '\n' % (person_id, twitter_username)
+ )
fp.write("")
diff --git a/pyscraper/gidmatching.py b/pyscraper/gidmatching.py
index c11ad2a5..84287746 100644
--- a/pyscraper/gidmatching.py
+++ b/pyscraper/gidmatching.py
@@ -1,344 +1,406 @@
+import difflib
import re
+
import miscfuncs
-import difflib
-#from xmlfilewrite import PrevParsedFile
+
+# from xmlfilewrite import PrevParsedFile
class PrevParsedFile:
- pass
+ pass
+
toppath = miscfuncs.toppath
pwxmldirs = miscfuncs.pwxmldirs
tempfilename = miscfuncs.tempfilename
-from miscfuncs import NextAlphaString, AlphaStringToOrder
-
-
# get the min index that matches this
def GetMinIndex(indx, a):
- assert indx[0] == 0 and a < indx[-1]
- i0, i1 = 0, len(indx) - 1
- while i0 + 1 < i1:
- im = (i0 + i1) // 2
- assert i0 != im and i1 != im
- if indx[im] <= a:
- i0 = im
- else:
- i1 = im
- assert indx[i0] <= a < indx[i1]
- return i0
+ assert indx[0] == 0 and a < indx[-1]
+ i0, i1 = 0, len(indx) - 1
+ while i0 + 1 < i1:
+ im = (i0 + i1) // 2
+ assert i0 != im and i1 != im
+ if indx[im] <= a:
+ i0 = im
+ else:
+ i1 = im
+ assert indx[i0] <= a < indx[i1]
+ return i0
def PrepareXMLForDiff(scrapeversion):
- chks = re.findall("<(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)\s(.*?)>\n?([\s\S]*?)\n?\s*(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)>", scrapeversion)
-
- # make identically structured huge string over the previous xml file with heading stuff stripped out
- essxlist = [ ]
- essxindx = [ ]
- for chk in chks:
- # print chk
- assert chk[0] == chk[3] # chunk type (this can fail due to the lack of two \n's between the two labels, and thus detect an empty speech, which should not be there.)
- # new_chk = chk[2]
- new_chk = re.sub(
- r'(?s)(<(p|tr)\s[^>]*>)(.*?)(<\/\2>)',
- lambda m: (''.join((m.group(1), re.sub('\n', ' ', m.group(3)), m.group(4)))),
- chk[2]
- )
- essxindx.append(len(essxlist))
- essxlist.append("HEADING-" + chk[0])
- speaker = re.search('nospeaker="true"|divnumber|(?:speakerid|person_id)="[^"]*"', chk[1]).group(0)
- essxlist.append(speaker)
-
- if re.match("oral-heading|major-heading|minor-heading", chk[0]):
- #assert not re.search("[<>]", chk[2])
- heading = new_chk.strip()
- essxlist.extend(heading.split())
- else:
- for ps in new_chk.split('\n'):
- m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*(?:p|tr)>\s*$", ps)
- if m:
- para = m.group(1)
- else:
- assert re.match("\s*?(?:table|tbody|thead|caption|divisioncount|mplist|mpname|lordlist|lord)", ps)
- para = ps
- essxlist.extend(re.findall("<[^>]*>|&\w+;|[^<>\s]+", para))
-
- essxindx.append(len(essxlist))
- assert len(chks) + 1 == len(essxindx)
- return essxindx, essxlist, chks
+ chks = re.findall(
+ "<(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)\s(.*?)>\n?([\s\S]*?)\n?\s*(major-heading|minor-heading|oral-heading|speech|division|divisioncount|ques|reply)>",
+ scrapeversion,
+ )
+
+ # make identically structured huge string over the previous xml file with heading stuff stripped out
+ essxlist = []
+ essxindx = []
+ for chk in chks:
+ # print chk
+ assert (
+ chk[0] == chk[3]
+ ) # chunk type (this can fail due to the lack of two \n's between the two labels, and thus detect an empty speech, which should not be there.)
+ # new_chk = chk[2]
+ new_chk = re.sub(
+ r"(?s)(<(p|tr)\s[^>]*>)(.*?)(<\/\2>)",
+ lambda m: (
+ "".join((m.group(1), re.sub("\n", " ", m.group(3)), m.group(4)))
+ ),
+ chk[2],
+ )
+ essxindx.append(len(essxlist))
+ essxlist.append("HEADING-" + chk[0])
+ speaker = re.search(
+ 'nospeaker="true"|divnumber|(?:speakerid|person_id)="[^"]*"', chk[1]
+ ).group(0)
+ essxlist.append(speaker)
+
+ if re.match("oral-heading|major-heading|minor-heading", chk[0]):
+ # assert not re.search("[<>]", chk[2])
+ heading = new_chk.strip()
+ essxlist.extend(heading.split())
+ else:
+ for ps in new_chk.split("\n"):
+ m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*(?:p|tr)>\s*$", ps)
+ if m:
+ para = m.group(1)
+ else:
+ assert re.match(
+ "\s*?(?:table|tbody|thead|caption|divisioncount|mplist|mpname|lordlist|lord)",
+ ps,
+ )
+ para = ps
+ essxlist.extend(re.findall("<[^>]*>|&\w+;|[^<>\s]+", para))
+
+ essxindx.append(len(essxlist))
+ assert len(chks) + 1 == len(essxindx)
+ return essxindx, essxlist, chks
+
# the difficult function that finds matches in the gids
# we don't use an xml parsing feature because it transforms the text
# Very hard use of difflib going on here too
# We make great use of the indices of the different lists
def FactorChanges(flatb, scrapeversion):
- essxindx, essxlist, chks = PrepareXMLForDiff(scrapeversion)
-
- # now make a huge string over the flatb with heading stuff stripped out
- essflatblist = [ ]
- essflatbindx = [ ]
- for qb in flatb:
- essflatbindx.append(len(essflatblist))
- essflatblist.append("HEADING-" + qb.typ)
- essflatblist.append(re.search('nospeaker="true"|(?:speakerid|person_id)="[^"]*"', qb.speaker).group(0))
-
- if re.match("oral-heading|major-heading|minor-heading", qb.typ):
- heading = ("".join(qb.stext)).strip()
- essflatblist.extend(heading.split())
-
- # strip format labels out of paragraphs
- else:
- for ps in qb.stext:
- m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*(?:p|tr)>\s*$", ps)
- if m:
- para = m.group(1)
- else:
- assert re.match("\s*?(?:table|tbody|thead|caption|divisioncount|mplist|mpname|lordlist|lord)", ps)
- para = ps
- # html tags should be words on their own
- essflatblist.extend(re.findall("<[^>]*>|&\w+;|[^<>\s]+", para))
-
- essflatbindx.append(len(essflatblist))
- assert len(essflatbindx) == len(flatb) + 1
-
-
- # make parallel sequences to the flatb and to this which are stripped down to their essence
- # so that the difflib can work on them
- return DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb)
+ essxindx, essxlist, chks = PrepareXMLForDiff(scrapeversion)
+
+ # now make a huge string over the flatb with heading stuff stripped out
+ essflatblist = []
+ essflatbindx = []
+ for qb in flatb:
+ essflatbindx.append(len(essflatblist))
+ essflatblist.append("HEADING-" + qb.typ)
+ essflatblist.append(
+ re.search(
+ 'nospeaker="true"|(?:speakerid|person_id)="[^"]*"', qb.speaker
+ ).group(0)
+ )
+
+ if re.match("oral-heading|major-heading|minor-heading", qb.typ):
+ heading = ("".join(qb.stext)).strip()
+ essflatblist.extend(heading.split())
+
+ # strip format labels out of paragraphs
+ else:
+ for ps in qb.stext:
+ m = re.match("\s*<(?:p|tr)[^>]*>\s*(.*?)\s*(?:p|tr)>\s*$", ps)
+ if m:
+ para = m.group(1)
+ else:
+ assert re.match(
+ "\s*?(?:table|tbody|thead|caption|divisioncount|mplist|mpname|lordlist|lord)",
+ ps,
+ )
+ para = ps
+ # html tags should be words on their own
+ essflatblist.extend(re.findall("<[^>]*>|&\w+;|[^<>\s]+", para))
+
+ essflatbindx.append(len(essflatblist))
+ assert len(essflatbindx) == len(flatb) + 1
+
+ # make parallel sequences to the flatb and to this which are stripped down to their essence
+ # so that the difflib can work on them
+ return DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb)
def DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb):
- # now apply the diffing function on this
- sm = difflib.SequenceMatcher(None, essxlist, essflatblist)
- smblocks = [ ((smb[0], smb[0] + smb[2]), (smb[1], smb[1] + smb[2])) for smb in sm.get_matching_blocks()[:-1] ]
-
- # we collect the range for the previous speeches and map it to a set of ranges
- # in the next speeches
-
- # case of missing entries map to the last speech matched to.
- lastmatchg = None
-
- res = [ ]
- for ix in range(len(chks)):
- ixr = (essxindx[ix], essxindx[ix + 1])
- nixrl = [ ]
- nixrlsz = 0
-
- # intersect the set of ranges against the contiguous blocks and match forwards
- for lsmb in smblocks:
- if ixr[1] > lsmb[0][0] and ixr[0] < lsmb[0][1]:
- ixi = (max(ixr[0], lsmb[0][0]), min(ixr[1], lsmb[0][1]))
- assert ixi[0] < ixi[1]
- offs = lsmb[1][0] - lsmb[0][0]
- ixit = (ixi[0] + offs, ixi[1] + offs)
- assert not nixrl or (nixrl[-1][1] <= ixit[0])
- nixrl.append(ixit)
- nixrlsz += ixit[1] - ixit[0]
-
- # at least one word is overlapping
- if nixrl:
- # go through the matching cases
- matchlist = [ GetMinIndex(essflatbindx, nixrl[0][0]) ]
- if nixrlsz != ixr[1] - ixr[0] or len(nixrl) > 1:
- matchtype = "changes"
- for ixit in nixrl:
- ml = GetMinIndex(essflatbindx, ixit[0])
- if matchlist[-1] != ml:
- matchlist.append(ml)
- ml = GetMinIndex(essflatbindx, ixit[1] - 1)
- if matchlist[-1] != ml:
- matchlist.append(ml)
- if len(matchlist) != 1:
- matchtype = "multiplecover"
- else:
- assert len(nixrl) == 1
- matchtype = "perfectmatch"
-
- # missing speech
- else:
- print(chks[ix])
- if lastmatchg:
- print("Missing speech matched to last matched speech")
- matchlist = [ lastmatchg ]
- else:
- print("No match on first speech problem.")
- matchlist = []
- matchtype = "missing"
-
- # output the (sometimes more than) one redirect of the right redirect type
- chk = chks[ix]
- oldgid = re.search('id="([\w\d\-\.,/]*)"', chk[1]).group(1)
- for matchg in matchlist:
- res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (oldgid, flatb[matchg].GID, matchtype))
- lastmatchg = matchg
-
- # output old version as well, if it's different
- if matchtype != "perfectmatch":
- res.append("<%s %s>\n" % (chk[0], chk[1]))
- res.append(chk[2])
- res.append("\n")
- res.append("%s>\n" % chk[0])
-
- return res
+ # now apply the diffing function on this
+ sm = difflib.SequenceMatcher(None, essxlist, essflatblist)
+ smblocks = [
+ ((smb[0], smb[0] + smb[2]), (smb[1], smb[1] + smb[2]))
+ for smb in sm.get_matching_blocks()[:-1]
+ ]
+
+ # we collect the range for the previous speeches and map it to a set of ranges
+ # in the next speeches
+
+ # case of missing entries map to the last speech matched to.
+ lastmatchg = None
+
+ res = []
+ for ix in range(len(chks)):
+ ixr = (essxindx[ix], essxindx[ix + 1])
+ nixrl = []
+ nixrlsz = 0
+
+ # intersect the set of ranges against the contiguous blocks and match forwards
+ for lsmb in smblocks:
+ if ixr[1] > lsmb[0][0] and ixr[0] < lsmb[0][1]:
+ ixi = (max(ixr[0], lsmb[0][0]), min(ixr[1], lsmb[0][1]))
+ assert ixi[0] < ixi[1]
+ offs = lsmb[1][0] - lsmb[0][0]
+ ixit = (ixi[0] + offs, ixi[1] + offs)
+ assert not nixrl or (nixrl[-1][1] <= ixit[0])
+ nixrl.append(ixit)
+ nixrlsz += ixit[1] - ixit[0]
+
+ # at least one word is overlapping
+ if nixrl:
+ # go through the matching cases
+ matchlist = [GetMinIndex(essflatbindx, nixrl[0][0])]
+ if nixrlsz != ixr[1] - ixr[0] or len(nixrl) > 1:
+ matchtype = "changes"
+ for ixit in nixrl:
+ ml = GetMinIndex(essflatbindx, ixit[0])
+ if matchlist[-1] != ml:
+ matchlist.append(ml)
+ ml = GetMinIndex(essflatbindx, ixit[1] - 1)
+ if matchlist[-1] != ml:
+ matchlist.append(ml)
+ if len(matchlist) != 1:
+ matchtype = "multiplecover"
+ else:
+ assert len(nixrl) == 1
+ matchtype = "perfectmatch"
+
+ # missing speech
+ else:
+ print(chks[ix])
+ if lastmatchg:
+ print("Missing speech matched to last matched speech")
+ matchlist = [lastmatchg]
+ else:
+ print("No match on first speech problem.")
+ matchlist = []
+ matchtype = "missing"
+
+ # output the (sometimes more than) one redirect of the right redirect type
+ chk = chks[ix]
+ oldgid = re.search('id="([\w\d\-\.,/]*)"', chk[1]).group(1)
+ for matchg in matchlist:
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n'
+ % (oldgid, flatb[matchg].GID, matchtype)
+ )
+ lastmatchg = matchg
+
+ # output old version as well, if it's different
+ if matchtype != "perfectmatch":
+ res.append("<%s %s>\n" % (chk[0], chk[1]))
+ res.append(chk[2])
+ res.append("\n")
+ res.append("%s>\n" % chk[0])
+
+ return res
def MeasureBlockSimilarity(oldtext, qblock):
- flattenoldtext = re.split("<[^>]*>|\s+", oldtext)
- flattennewtext = qblock.FlattenTextWords()
+ flattenoldtext = re.split("<[^>]*>|\s+", oldtext)
+ flattennewtext = qblock.FlattenTextWords()
- sm = difflib.SequenceMatcher(lambda x: x == "", flattenoldtext, flattennewtext)
- return sm.ratio()
+ sm = difflib.SequenceMatcher(lambda x: x == "", flattenoldtext, flattennewtext)
+ return sm.ratio()
# special case because the questions can be re-ordered
def FactorChangesWrans(majblocks, scrapeversion):
-
- # we need to break the scrape version
- # we separate out and match the major headings separately
- # (anyway, these aren't really used)
-
- # and then match the questions
-
- # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases
- res = re.findall('<gidredirect oldgid="[^"]*" newgid="[^"]*" matchtype="oldwranstype"/>\n', scrapeversion)
-
- # extract major headings and match injectively exactly (till we find a failed example).
- mhchks = re.findall('<major-heading id="([^"]*)"[^>]*>\n\s*([\s\S]*?)\s*?\n</major-heading>', scrapeversion)
-
- majblocknames = [ "".join(majblock[0].stext).strip() for majblock in majblocks ]
- for mhchk in mhchks:
- if mhchk[1] in majblocknames:
- i = majblocknames.index(mhchk[1])
- res.append('<gidredirect oldgid="%s" newgid="%s"/>\n' % (mhchk[0], majblocks[i][0].qGID))
- majblocknames[i] = None # take it out of circulation
- else:
- res.append('<gidredirect oldgid="%s" newgid="%s"/>\n' % (mhchk[0], majblocks[0][0].qGID))
-
- # break into question blocks
- # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text
- # the " tags have been removed, so split to end of document
- qebchks = re.findall(']*)>\n([\s\S]*?)\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)',
- scrapeversion)
-
- # make the map from qnums to blocks
- qnummissings = [ ]
- qnummapq = { }
- for majblock in majblocks:
- for qblock in majblock[1]:
- for qnum in qblock.qnums:
- assert qnum not in qnummapq # failure means this qnum is found twice in the newly parsed file.
- qnummapq[qnum] = qblock
- if re.match("ZZZZerror", qnum):
- qnummissings.append(qnum)
-
-
- # for each block, find the map forward and check if we want to reprint it in full.
- for qebchk in qebchks:
- qqnums = re.findall('<ques [^>]*?qnum="([\d\w]+)">', qebchk[3])
- assert qqnums
-
- # make sure that they all link to the same qnum in the new one
- qblock = None
- for qqnum in qqnums:
- if qblock:
- if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID:
- print(qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID)
- assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID
- elif qqnum != '0' and qqnum in qnummapq: # 0 is when there is a missing qnum
- qblock = qnummapq[qqnum]
-
- # in this case the qnums fail to find the match, so we either drop it, or find
- # the match by closest in text. Prefer to match blocks to
- if not qblock:
- # find the closest match for this block out of this missing qnum blocks on the new page
- # (this will need to account for all blocks if in future the correction is to add in the qnum)
- if qnummissings:
- qmissblocksscore = [ ]
- for qqnum in qnummissings:
- similarity = MeasureBlockSimilarity(qebchk[3], qnummapq[qqnum])
- qmissblocksscore.append((similarity, qqnum))
- qmissblockscorebest = max(qmissblocksscore)
- qblock = qnummapq[qmissblockscorebest[1]]
- if miscfuncs.IsNotQuiet():
- print("Missing qnum; mapping %s to %s with score %f" % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0]))
- assert qmissblockscorebest[0] > 0.8 # otherwise it's not really a match and we need to look harder.
- # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them.
-
- # now have to check matching.
- # convert both to strings and compare.
- essxfq = [ ] # this forms the string which we will be comparing against.
- qebchkquesids = [ ] # expect only one of each
- qebchkreplids = [ ]
- for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]):
- mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>', wd)
- if mwd:
- essxfq.append("<%s>" % mwd.group(1))
- assert mwd.group(1) not in ("reply", "ques") or mwd.group(2)
- if mwd.group(1) == "ques":
- qebchkquesids.append(mwd.group(2))
- elif mwd.group(1) == "reply":
- qebchkreplids.append(mwd.group(2))
-
- elif not re.match("\n' % (qebchk[0], majblocks[0][0].qGID))
- for qebq in qebchkquesids:
- res.append('\n' % (qebq, majblocks[0][0].qGID))
- for qebqr in qebchkreplids:
- res.append('\n' % (qebqr, majblocks[0][0].qGID))
- # Is the lred current-gidredirects bit needed here too? Don't think so, but not sure
- continue
-
- # build up the same summary from the question block
- essbkfq = [ ]
- for qblockqr in (qblock.queses, qblock.replies):
- for qb in qblockqr:
- essbkfq.append("<%s>" % qb.typ)
- for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", "\n".join(qb.stext)):
- mwd = re.match("<(p|tr)[^>]*>", wd)
- if mwd:
- essbkfq.append("<%s>" % mwd.group(1))
- elif not re.match("" % qb.typ)
-
- # print the link forwards
- bchanges = (essxfq != essbkfq)
- matchtype = bchanges and "changes" or "perfectmatch"
- if bchanges:
- res.append("\n")
- res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebchk[0], qblock.headingqb.qGID, matchtype))
-
- # write the parallel redirects for the question and reply (both mapping to same parts of each)
- # this may be more sophisticated once we see an example of failure
- # ultimately this is a job for paragraph matching
-
- # sometimes we get more than one question.
- # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother.
- if len(qebchkquesids) != len(qblock.queses):
- print(len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID)
- assert len(qebchkquesids) == len(qblock.queses)
- for i in range(len(qebchkquesids)):
- res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebchkquesids[i], qblock.queses[i].qGID, matchtype))
-
- assert len(qebchkreplids) == len(qblock.replies) == 1
- for qebqr in qebchkreplids:
- res.append('<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n' % (qebqr, qblock.replies[0].qGID, matchtype))
-
-
- # if changes write out the original, else just the gidmaps
- if bchanges:
- res.append('<minor-heading id="%s"%s>\n' % qebchk[0:2])
- res.append(qebchk[2])
- res.append('\n')
- res.append(qebchk[3])
- res.append("\n\n")
- else:
- for lred in re.findall("]*>\n", qebchk[3]):
- res.append("\t")
- res.append(lred)
-
- return res
-
+ # we need to break the scrape version
+ # we separate out and match the major headings separately
+ # (anyway, these aren't really used)
+
+ # and then match the questions
+
+ # first extract all the oldtype gid-redirects that will have been put in here by the pre-2005 bMakeOldWransGidsToNew cases
+ res = re.findall(
+ '<gidredirect oldgid="[^"]*" newgid="[^"]*" matchtype="oldwranstype"/>\n',
+ scrapeversion,
+ )
+
+ # extract major headings and match injectively exactly (till we find a failed example).
+ mhchks = re.findall(
+ '<major-heading id="([^"]*)"[^>]*>\n\s*([\s\S]*?)\s*?\n</major-heading>',
+ scrapeversion,
+ )
+
+ majblocknames = ["".join(majblock[0].stext).strip() for majblock in majblocks]
+ for mhchk in mhchks:
+ if mhchk[1] in majblocknames:
+ i = majblocknames.index(mhchk[1])
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s"/>\n'
+ % (mhchk[0], majblocks[i][0].qGID)
+ )
+ majblocknames[i] = None # take it out of circulation
+ else:
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s"/>\n'
+ % (mhchk[0], majblocks[0][0].qGID)
+ )
+
+ # break into question blocks
+ # [0]=headingGID, [1]=further choss, [2]=headingtext, [3]=question+reply text
+ # the " tags have been removed, so split to end of document
+ qebchks = re.findall(
+ ']*)>\n([\s\S]*?)\n([\s\S]*?)\s*(?=<(?:major-heading|minor-heading|gidredirect[^>]*oldwranstype)|$)',
+ scrapeversion,
+ )
+
+ # make the map from qnums to blocks
+ qnummissings = []
+ qnummapq = {}
+ for majblock in majblocks:
+ for qblock in majblock[1]:
+ for qnum in qblock.qnums:
+ assert (
+ qnum not in qnummapq
+ ) # failure means this qnum is found twice in the newly parsed file.
+ qnummapq[qnum] = qblock
+ if re.match("ZZZZerror", qnum):
+ qnummissings.append(qnum)
+
+ # for each block, find the map forward and check if we want to reprint it in full.
+ for qebchk in qebchks:
+ qqnums = re.findall('<ques [^>]*?qnum="([\d\w]+)">', qebchk[3])
+ assert qqnums
+
+ # make sure that they all link to the same qnum in the new one
+ qblock = None
+ for qqnum in qqnums:
+ if qblock:
+ if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID:
+ print(qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID)
+ assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID
+ elif (
+ qqnum != "0" and qqnum in qnummapq
+ ): # 0 is when there is a missing qnum
+ qblock = qnummapq[qqnum]
+
+ # in this case the qnums fail to find the match, so we either drop it, or find
+ # the match by closest in text. Prefer to match blocks to
+ if not qblock:
+ # find the closest match for this block out of this missing qnum blocks on the new page
+ # (this will need to account for all blocks if in future the correction is to add in the qnum)
+ if qnummissings:
+ qmissblocksscore = []
+ for qqnum in qnummissings:
+ similarity = MeasureBlockSimilarity(qebchk[3], qnummapq[qqnum])
+ qmissblocksscore.append((similarity, qqnum))
+ qmissblockscorebest = max(qmissblocksscore)
+ qblock = qnummapq[qmissblockscorebest[1]]
+ if miscfuncs.IsNotQuiet():
+ print(
+ "Missing qnum; mapping %s to %s with score %f"
+ % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0])
+ )
+ assert (
+ qmissblockscorebest[0] > 0.8
+ ) # otherwise it's not really a match and we need to look harder.
+ # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them.
+
+ # now have to check matching.
+ # convert both to strings and compare.
+ essxfq = [] # this forms the string which we will be comparing against.
+ qebchkquesids = [] # expect only one of each
+ qebchkreplids = []
+ for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", qebchk[3]):
+ mwd = re.match('<(p|tr|reply|ques)\s*(?:p?id="([^"]*)")?[^>]*>', wd)
+ if mwd:
+ essxfq.append("<%s>" % mwd.group(1))
+ assert mwd.group(1) not in ("reply", "ques") or mwd.group(2)
+ if mwd.group(1) == "ques":
+ qebchkquesids.append(mwd.group(2))
+ elif mwd.group(1) == "reply":
+ qebchkreplids.append(mwd.group(2))
+
+ elif not re.match("\n'
+ % (qebchk[0], majblocks[0][0].qGID)
+ )
+ for qebq in qebchkquesids:
+ res.append(
+ '\n'
+ % (qebq, majblocks[0][0].qGID)
+ )
+ for qebqr in qebchkreplids:
+ res.append(
+ '\n'
+ % (qebqr, majblocks[0][0].qGID)
+ )
+ # Is the lred current-gidredirects bit needed here too? Don't think so, but not sure
+ continue
+
+ # build up the same summary from the question block
+ essbkfq = []
+ for qblockqr in (qblock.queses, qblock.replies):
+ for qb in qblockqr:
+ essbkfq.append("<%s>" % qb.typ)
+ for wd in re.findall("<[^>]*>|&\w+;|[^<>\s]+", "\n".join(qb.stext)):
+ mwd = re.match("<(p|tr)[^>]*>", wd)
+ if mwd:
+ essbkfq.append("<%s>" % mwd.group(1))
+ elif not re.match("" % qb.typ)
+
+ # print the link forwards
+ bchanges = essxfq != essbkfq
+ matchtype = bchanges and "changes" or "perfectmatch"
+ if bchanges:
+ res.append("\n")
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n'
+ % (qebchk[0], qblock.headingqb.qGID, matchtype)
+ )
+
+ # write the parallel redirects for the question and reply (both mapping to same parts of each)
+ # this may be more sophisticated once we see an example of failure
+ # ultimately this is a job for paragraph matching
+
+ # sometimes we get more than one question.
+ # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother.
+ if len(qebchkquesids) != len(qblock.queses):
+ print(len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID)
+ assert len(qebchkquesids) == len(qblock.queses)
+ for i in range(len(qebchkquesids)):
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n'
+ % (qebchkquesids[i], qblock.queses[i].qGID, matchtype)
+ )
+
+ assert len(qebchkreplids) == len(qblock.replies) == 1
+ for qebqr in qebchkreplids:
+ res.append(
+ '<gidredirect oldgid="%s" newgid="%s" matchtype="%s"/>\n'
+ % (qebqr, qblock.replies[0].qGID, matchtype)
+ )
+
+ # if changes write out the original, else just the gidmaps
+ if bchanges:
+ res.append('<minor-heading id="%s"%s>\n' % qebchk[0:2])
+ res.append(qebchk[2])
+ res.append("\n")
+ res.append(qebchk[3])
+ res.append("\n\n")
+ else:
+ for lred in re.findall("]*>\n", qebchk[3]):
+ res.append("\t")
+ res.append(lred)
+
+ return res
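As a sanity check on the reindented GetMinIndex above: it binary-searches a list of cumulative block offsets for the block containing a word position. A tiny usage sketch with an invented index array (assumes the module imports cleanly, i.e. miscfuncs is importable):

    from gidmatching import GetMinIndex

    # indx holds block start offsets plus an end sentinel, so the blocks
    # here cover [0, 4), [4, 9) and [9, 15). Values are invented.
    indx = [0, 4, 9, 15]
    assert GetMinIndex(indx, 0) == 0
    assert GetMinIndex(indx, 5) == 1
    assert GetMinIndex(indx, 14) == 2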
diff --git a/pyscraper/lazyrunall.py b/pyscraper/lazyrunall.py
index 1da9d401..fc3eeda5 100755
--- a/pyscraper/lazyrunall.py
+++ b/pyscraper/lazyrunall.py
@@ -3,19 +3,19 @@
# Run the script with --help to see command line options
-import sys
import os
+import sys
# change current directory to pyscraper folder script is in
-os.chdir(os.path.dirname(sys.argv[0]) or '.')
+os.chdir(os.path.dirname(sys.argv[0]) or ".")
from optparse import OptionParser
-from runfilters import RunFiltersDir, RunNIFilters
-from regmem.filter import RunRegmemFilters
-import ni.scrape
-from regmem.pullgluepages import RegmemPullGluePages
+import ni.scrape
from miscfuncs import SetQuiet
+from regmem.filter import RunRegmemFilters
+from regmem.pullgluepages import RegmemPullGluePages
+from runfilters import RunFiltersDir, RunNIFilters
# Parse the command line parameters
@@ -42,33 +42,64 @@
# See what options there are
-parser.add_option("--force-parse",
- action="store_true", dest="forceparse", default=False,
- help="forces reprocessing of debates by first deleting output files")
-parser.add_option("--force-scrape",
- action="store_true", dest="forcescrape", default=False,
- help="forces redownloading of HTML first deleting output files")
-
-parser.add_option("--from", dest="datefrom", metavar="date", default="1000-01-01",
- help="date to process back to, default is start of time")
-parser.add_option("--to", dest="dateto", metavar="date", default="9999-12-31",
- help="date to process up to, default is present day")
-parser.add_option("--date", dest="date", metavar="date", default=None,
- help="date to process (overrides --from and --to)")
-
-parser.add_option("--patchtool",
- action="store_true", dest="patchtool", default=None,
- help="launch ./patchtool to fix errors in source HTML")
-parser.add_option("--quietc",
- action="store_true", dest="quietc", default=None,
- help="low volume error messages; continue processing further files")
+parser.add_option(
+ "--force-parse",
+ action="store_true",
+ dest="forceparse",
+ default=False,
+ help="forces reprocessing of debates by first deleting output files",
+)
+parser.add_option(
+ "--force-scrape",
+ action="store_true",
+ dest="forcescrape",
+ default=False,
+ help="forces redownloading of HTML first deleting output files",
+)
+
+parser.add_option(
+ "--from",
+ dest="datefrom",
+ metavar="date",
+ default="1000-01-01",
+ help="date to process back to, default is start of time",
+)
+parser.add_option(
+ "--to",
+ dest="dateto",
+ metavar="date",
+ default="9999-12-31",
+ help="date to process up to, default is present day",
+)
+parser.add_option(
+ "--date",
+ dest="date",
+ metavar="date",
+ default=None,
+ help="date to process (overrides --from and --to)",
+)
+
+parser.add_option(
+ "--patchtool",
+ action="store_true",
+ dest="patchtool",
+ default=None,
+ help="launch ./patchtool to fix errors in source HTML",
+)
+parser.add_option(
+ "--quietc",
+ action="store_true",
+ dest="quietc",
+ default=None,
+ help="low volume error messages; continue processing further files",
+)
(options, args) = parser.parse_args()
-if (options.date):
- options.datefrom = options.date
- options.dateto = options.date
+if options.date:
+ options.datefrom = options.date
+ options.dateto = options.date
if options.quietc:
- SetQuiet()
+ SetQuiet()
# See what commands there are
@@ -78,33 +109,33 @@
options.regmem = False
options.ni = False
for arg in args:
- if arg == "scrape":
- options.scrape = True
- elif arg == "parse":
- options.parse = True
- elif arg == "regmem":
- options.regmem = True
- options.remote = True
- elif arg == "regmem-local":
- options.regmem = True
- options.remote = False
- elif arg == "ni":
- options.ni = True
- else:
- print("error: no such option %s" % arg, file=sys.stderr)
- parser.print_help()
- sys.exit(1)
-if len(args) == 0:
+ if arg == "scrape":
+ options.scrape = True
+ elif arg == "parse":
+ options.parse = True
+ elif arg == "regmem":
+ options.regmem = True
+ options.remote = True
+ elif arg == "regmem-local":
+ options.regmem = True
+ options.remote = False
+ elif arg == "ni":
+ options.ni = True
+ else:
+ print("error: no such option %s" % arg, file=sys.stderr)
parser.print_help()
sys.exit(1)
+if len(args) == 0:
+ parser.print_help()
+ sys.exit(1)
if not options.scrape and not options.parse:
- print("error: choose what to do; scrape, parse, or both", file=sys.stderr)
- parser.print_help()
- sys.exit(1)
+ print("error: choose what to do; scrape, parse, or both", file=sys.stderr)
+ parser.print_help()
+ sys.exit(1)
if not options.regmem and not options.ni:
- print("error: choose what work on; regmem, several of them", file=sys.stderr)
- parser.print_help()
- sys.exit(1)
+ print("error: choose what work on; regmem, several of them", file=sys.stderr)
+ parser.print_help()
+ sys.exit(1)
# Download/generate the new data
@@ -116,7 +147,7 @@
# Parse it into XML
if options.parse:
- if options.ni:
- RunFiltersDir(RunNIFilters, 'ni', options, options.forceparse)
- if options.regmem:
- RunFiltersDir(RunRegmemFilters, 'regmem', options, options.forceparse)
+ if options.ni:
+ RunFiltersDir(RunNIFilters, "ni", options, options.forceparse)
+ if options.regmem:
+ RunFiltersDir(RunRegmemFilters, "regmem", options, options.forceparse)
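A quick sketch of how the reflowed options are consumed, using parser as built above (the date is invented):

    # Equivalent to: ./lazyrunall.py --date 2024-01-01 --quietc ni parse
    (options, args) = parser.parse_args(
        ["--date", "2024-01-01", "--quietc", "ni", "parse"]
    )
    assert options.date == "2024-01-01"  # copied into datefrom/dateto by the script
    assert args == ["ni", "parse"]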
diff --git a/pyscraper/lords/resolvenames.py b/pyscraper/lords/resolvenames.py
index 2589de6d..8c7d68f9 100644
--- a/pyscraper/lords/resolvenames.py
+++ b/pyscraper/lords/resolvenames.py
@@ -1,62 +1,85 @@
-import json
-import os.path
import re
-from contextexception import ContextException
from base_resolver import ResolverBase
+from contextexception import ContextException
-titleconv = { 'L.':'Lord',
- 'B.':'Baroness',
- 'Abp.':'Archbishop',
- 'Bp.':'Bishop',
- 'V.':'Viscount',
- 'E.':'Earl',
- 'D.':'Duke',
- 'M.':'Marquess',
- 'C.':'Countess',
- 'Ly.':'Lady',
- }
+titleconv = {
+ "L.": "Lord",
+ "B.": "Baroness",
+ "Abp.": "Archbishop",
+ "Bp.": "Bishop",
+ "V.": "Viscount",
+ "E.": "Earl",
+ "D.": "Duke",
+ "M.": "Marquess",
+ "C.": "Countess",
+ "Ly.": "Lady",
+}
# more tedious stuff to do: "earl of" and "sitting as" cases
-hontitles = [ 'Lord ?Bishop', 'Bishop', 'Marquess', 'Lord', 'Baroness', 'Viscount', 'Earl', 'Countess',
- 'Lord Archbishop', 'Archbishop', 'Duke', 'Lady' ]
-hontitleso = '|'.join(hontitles)
-
-honcompl = re.compile('(?:(%s)|(%s) \s*(.*?))(?:\s+of\s+(.*))?$' % (hontitleso, hontitleso))
+hontitles = [
+ "Lord ?Bishop",
+ "Bishop",
+ "Marquess",
+ "Lord",
+ "Baroness",
+ "Viscount",
+ "Earl",
+ "Countess",
+ "Lord Archbishop",
+ "Archbishop",
+ "Duke",
+ "Lady",
+]
+hontitleso = "|".join(hontitles)
+
+honcompl = re.compile(
+ "(?:(%s)|(%s) \s*(.*?))(?:\s+of\s+(.*))?$" % (hontitleso, hontitleso)
+)
+
+rehonorifics = re.compile("(?: [CKO]BE| DL| TD| QC| KCMG| KCB)+$")
-rehonorifics = re.compile('(?: [CKO]BE| DL| TD| QC| KCMG| KCB)+$')
class LordsList(ResolverBase):
- import_organization_id = 'house-of-lords'
+ import_organization_id = "house-of-lords"
def reloadJSON(self):
super(LordsList, self).reloadJSON()
- self.lordnames={} # "lordnames" --> lords
- self.aliases={} # Corrections to full names
+ self.lordnames = {} # "lordnames" --> lords
+ self.aliases = {} # Corrections to full names
self.import_people_json()
def import_people_membership(self, mship, posts, orgs):
- if 'organization_id' not in mship or mship['organization_id'] != self.import_organization_id:
+ if (
+ "organization_id" not in mship
+ or mship["organization_id"] != self.import_organization_id
+ ):
return
if mship["id"] in self.membertopersonmap:
raise Exception("Same member id %s appeared twice" % mship["id"])
- self.membertopersonmap[mship["id"]] = mship['person_id']
- self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"])
+ self.membertopersonmap[mship["id"]] = mship["person_id"]
+ self.persontomembermap.setdefault(mship["person_id"], []).append(mship["id"])
if self.members.get(mship["id"]):
raise Exception("Repeated identifier %s in members JSON file" % mship["id"])
self.members[mship["id"]] = mship
- if 'end_date' not in mship:
- mship['end_date'] = '9999-12-31'
+ if "end_date" not in mship:
+ mship["end_date"] = "9999-12-31"
def import_people_main_name(self, name, memberships):
- mships = [m for m in memberships if m['start_date'] <= name.get('end_date', '9999-12-31') and m['end_date'] >= name.get('start_date', '1000-01-01')]
- if not mships: return
+ mships = [
+ m
+ for m in memberships
+ if m["start_date"] <= name.get("end_date", "9999-12-31")
+ and m["end_date"] >= name.get("start_date", "1000-01-01")
+ ]
+ if not mships:
+ return
lname = name["lordname"] or name["lordofname"]
lname = re.sub("\.", "", lname)
assert lname
@@ -67,17 +90,22 @@ def import_people_main_name(self, name, memberships):
}
for m in mships:
newattr = attr.copy()
- newattr['start_date'] = max(m['start_date'], name.get('start_date', '1000-01-01'))
- newattr['end_date'] = min(m['end_date'], name.get('end_date', '9999-12-31'))
- newattr['id'] = m["id"]
+ newattr["start_date"] = max(
+ m["start_date"], name.get("start_date", "1000-01-01")
+ )
+ newattr["end_date"] = min(m["end_date"], name.get("end_date", "9999-12-31"))
+ newattr["id"] = m["id"]
self.lordnames.setdefault(lname, []).append(newattr)
def import_people_alternate_name(self, person, other_name, memberships):
- if 'name' not in other_name: return # Only full names in Lords aliases
- self.aliases[other_name['name']] = person['id']
+ if "name" not in other_name:
+ return # Only full names in Lords aliases
+ self.aliases[other_name["name"]] = person["id"]
# main matching function
- def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bDivision):
+ def GetLordID(
+ self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bDivision
+ ):
if ltitle == "Lord Bishop":
ltitle = "Bishop"
if ltitle == "Lord Archbishop":
@@ -85,15 +113,15 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD
llordofname = llordofname.replace(".", "")
llordname = llordname.replace(".", "")
- llordname = re.sub('&#(039|146|8217);', "'", llordname)
+ llordname = re.sub("(039|146|8217);", "'", llordname)
llordofname = llordofname.strip()
llordname = llordname.strip()
# TODO: Need a Lords version of member-aliases.xml I guess
- if ltitle == "Bishop" and llordofname == "Southwell" and sdate>='2005-07-01':
+ if ltitle == "Bishop" and llordofname == "Southwell" and sdate >= "2005-07-01":
llordofname = "Southwell and Nottingham"
- if ltitle == "Bishop" and llordname == "Southwell" and sdate>='2005-07-01':
+ if ltitle == "Bishop" and llordname == "Southwell" and sdate >= "2005-07-01":
llordname = "Southwell and Nottingham"
lname = llordname or llordofname
@@ -101,11 +129,11 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD
lmatches = self.lordnames.get(lname, [])
# match to successive levels of precision for identification
- res = [ ]
+ res = []
for lm in lmatches:
if lm["title"] != ltitle: # mismatch title
continue
- if llordname and llordofname: # two name case
+ if llordname and llordofname: # two name case
if (lm["lordname"] == llordname) and (lm["lordofname"] == llordofname):
if lm["start_date"] <= sdate <= lm["end_date"]:
res.append(lm)
@@ -128,29 +156,62 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD
if lname == lmlname:
if lm["start_date"] <= sdate <= lm["end_date"]:
if lm["lordname"] and llordofname:
- #if not IsNotQuiet():
- print("cm---", ltitle, lm["lordname"], lm["lordofname"], llordname, llordofname)
- raise ContextException("lordofname matches lordname in lordlist", stamp=stampurl, fragment=lname)
+ # if not IsNotQuiet():
+ print(
+ "cm---",
+ ltitle,
+ lm["lordname"],
+ lm["lordofname"],
+ llordname,
+ llordofname,
+ )
+ raise ContextException(
+ "lordofname matches lordname in lordlist",
+ stamp=stampurl,
+ fragment=lname,
+ )
else:
assert lm["lordofname"] and llordname
# of-name distinction lost in division lists
if not bDivision:
- raise ContextException("lordname matches lordofname in lordlist", stamp=stampurl, fragment=lname)
+ raise ContextException(
+ "lordname matches lordofname in lordlist",
+ stamp=stampurl,
+ fragment=lname,
+ )
res.append(lm)
- elif ltitle != "Bishop" and ltitle != "Archbishop" and (ltitle, lname) not in (("Duke", "Norfolk"), ("Duke", "Wellington"), ('Earl', 'Kinnoull'), ('Earl', 'Selborne')):
+ elif (
+ ltitle != "Bishop"
+ and ltitle != "Archbishop"
+ and (ltitle, lname)
+ not in (
+ ("Duke", "Norfolk"),
+ ("Duke", "Wellington"),
+ ("Earl", "Kinnoull"),
+ ("Earl", "Selborne"),
+ )
+ ):
print(lm)
- raise ContextException("wrong dates on lords with same name", stamp=stampurl, fragment=lname)
+ raise ContextException(
+ "wrong dates on lords with same name",
+ stamp=stampurl,
+ fragment=lname,
+ )
if not res:
- raise ContextException("unknown lord %s %s %s %s on %s" % (ltitle, llordname, llordofname, stampurl, sdate), stamp=stampurl, fragment=lname)
+ raise ContextException(
+ "unknown lord %s %s %s %s on %s"
+ % (ltitle, llordname, llordofname, stampurl, sdate),
+ stamp=stampurl,
+ fragment=lname,
+ )
assert len(res) == 1
return self.membertoperson(res[0]["id"])
-
def GetLordIDfname(self, name, loffice, sdate, stampurl=None):
name = re.sub("^The ", "", name)
- name = name.replace(' Of ', ' of ')
+ name = name.replace(" Of ", " of ")
if name in self.aliases:
return self.aliases[name]
@@ -160,7 +221,9 @@ def GetLordIDfname(self, name, loffice, sdate, stampurl=None):
hom = honcompl.match(name)
if not hom:
- raise ContextException("lord name format failure on '%s'" % name, stamp=stampurl, fragment=name)
+ raise ContextException(
+ "lord name format failure on '%s'" % name, stamp=stampurl, fragment=name
+ )
# now we have a speaker, try and break it up
ltit = hom.group(1)
@@ -181,25 +244,28 @@ def GetLordIDfname(self, name, loffice, sdate, stampurl=None):
return self.GetLordID(ltit, lname, lplace, loffice, stampurl, sdate, False)
-
def MatchRevName(self, fss, sdate, stampurl):
assert fss
- lfn = re.match('(.*?)(?: of (.*?))?, {0,3}((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$', fss)
+ lfn = re.match(
+ "(.*?)(?: of (.*?))?, {0,3}((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$", fss
+ )
if not lfn:
print("$$$%s$$$" % fss)
- raise ContextException("No match of format in MatchRevName", stamp=stampurl, fragment=fss)
+ raise ContextException(
+ "No match of format in MatchRevName", stamp=stampurl, fragment=fss
+ )
shorttitle = lfn.group(3)
- if shorttitle[-1] != '.':
+ if shorttitle[-1] != ".":
shorttitle += "."
ltitle = titleconv[shorttitle]
llordname = lfn.group(1).replace(".", "")
llordname = llordname.replace("&#039;", "'")
llordname = re.sub("^De ", "de ", llordname)
- fullname = '%s %s' % (ltitle, llordname)
+ fullname = "%s %s" % (ltitle, llordname)
llordofname = ""
if lfn.group(2):
llordofname = lfn.group(2).replace(".", "")
- fullname = '%s of %s' % (fullname, llordofname)
+ fullname = "%s of %s" % (fullname, llordofname)
if fullname in self.aliases:
return self.aliases[fullname]
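
For orientation: MatchRevName parses the reversed "Name of Place, Title." form used in Lords division lists. A minimal sketch of a call, assuming titleconv maps "L." to "Lord" (the name and date below are only examples):

    lordslist = LordsList()
    # "Falconer of Thoroton, L." -> lordname "Falconer", lordofname "Thoroton",
    # short title "L." -> "Lord", fullname "Lord Falconer of Thoroton"
    person_id = lordslist.MatchRevName("Falconer of Thoroton, L.", "2005-07-01", None)
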
diff --git a/pyscraper/miscfuncs.py b/pyscraper/miscfuncs.py
index 1cc463e3..ec06620a 100755
--- a/pyscraper/miscfuncs.py
+++ b/pyscraper/miscfuncs.py
@@ -1,43 +1,51 @@
+import os
import re
-import sys
import string
-import os
import tempfile
# make the top path data directory value
-toppath = os.path.abspath('../../parldata')
+toppath = os.path.abspath("../../parldata")
if not os.path.exists(toppath):
- toppath = os.path.abspath('../../../parldata')
+ toppath = os.path.abspath("../../../parldata")
if not os.path.exists(toppath):
- toppath = os.path.abspath(os.path.expanduser('~/parldata/'))
+ toppath = os.path.abspath(os.path.expanduser("~/parldata/"))
if not os.path.exists(toppath):
- toppath = 'C:\\parldata'
+ toppath = "C:\\parldata"
# output directories used for the scraper
pwcmdirs = os.path.join(toppath, "cmpages")
pwxmldirs = os.path.join(toppath, "scrapedxml")
-pwpatchesdirs = os.path.abspath("patches") # made locally, relative to the lazyrunall.py module. Should be relative to toppath eventually
+pwpatchesdirs = os.path.abspath(
+ "patches"
+) # made locally, relative to the lazyrunall.py module. Should be relative to toppath eventually
-if (not os.path.isdir(toppath)):
- raise Exception('Data directory %s does not exist, please create' % (toppath))
+if not os.path.isdir(toppath):
+ raise Exception("Data directory %s does not exist, please create" % (toppath))
# print "Data directory (set in miscfuncs.py): %s" % toppath
# temporary files are stored here
tmppath = os.path.join(toppath, "tmp")
-if (not os.path.isdir(tmppath)):
- os.mkdir(tmppath)
+if not os.path.isdir(tmppath):
+ os.mkdir(tmppath)
tempfilename = tempfile.mktemp("", "pw-gluetemp-", tmppath)
# find raw data path
rawdatapath = os.path.join(os.getcwd(), "../rawdata")
-if (not os.path.isdir(toppath)):
- raise Exception('Raw data directory %s does not exist, you\'ve not got a proper checkout from CVS.' % (toppath))
+if not os.path.isdir(toppath):
+ raise Exception(
+ "Raw data directory %s does not exist, you've not got a proper checkout from CVS."
+ % (toppath)
+ )
# quiet flag
bNotQuiet = True
+
+
def SetQuiet():
global bNotQuiet
bNotQuiet = False
+
+
def IsNotQuiet():
return bNotQuiet
@@ -48,16 +56,17 @@ def IsNotQuiet():
# use this to generate chronological scraped files of the same page
def NextAlphaString(s):
- assert re.match('[a-z]*$', s)
+ assert re.match("[a-z]*$", s)
if not s:
- return 'a'
+ return "a"
i = string.ascii_lowercase.find(s[-1]) + 1
if i < len(string.ascii_lowercase):
return s[:-1] + string.ascii_lowercase[i]
- return NextAlphaString(s[:-1]) + 'a'
+ return NextAlphaString(s[:-1]) + "a"
+
def AlphaStringToOrder(s):
- assert re.match('[a-z]*$', s)
+ assert re.match("[a-z]*$", s)
res = 0
while s:
i = string.ascii_lowercase.find(s[0]) + 1
@@ -65,6 +74,7 @@ def AlphaStringToOrder(s):
s = s[1:]
return res
+
# Impossible to do 6pm, 7.15pm, 6.30pm, 6.45pm, 7pm without future timestamps
# So not caring any more about timestamp errors
# Need good timestamps for video ;-) So turning back on, might try different tack at some point
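
A quick illustration of the two alpha-string helpers above (a sketch; it assumes the accumulation step elided by the hunk boundary is res = res * 26 + i):

    assert NextAlphaString("") == "a"    # first scrape of a page
    assert NextAlphaString("a") == "b"
    assert NextAlphaString("z") == "aa"  # carries like base-26 counting
    assert AlphaStringToOrder("a") == 1
    assert AlphaStringToOrder("aa") == 27
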
@@ -73,8 +83,10 @@ def AlphaStringToOrder(s):
regparsetime = re.compile("^(\d+)[\.:]\s*(\d+)(?:\s?|&nbsp;)([\w\.]*)$")
# 7 pm
regparsetimeonhour = re.compile("^(\d+)()(?:\s?|&nbsp;)([\w\.]+)$")
+
+
def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
- #print "time ", time
+ # print "time ", time
previoustime = None
if previoustimearr:
@@ -88,7 +100,7 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
timeparts = regparsetimeonhour.match(time)
if timeparts:
hour = int(timeparts.group(1))
- if (timeparts.group(2) != ""):
+ if timeparts.group(2) != "":
mins = int(timeparts.group(2))
else:
mins = 0
@@ -114,12 +126,15 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
hour -= 12
if previoustime and previoustimehour + 12 <= hour:
- print("TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? %s" % (previoustime, time, repr(stampurl)))
+ print(
+ "TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? %s"
+ % (previoustime, time, repr(stampurl))
+ )
- elif time == 'Midnight':
+ elif time == "Midnight":
hour = 24
mins = 0
- elif time == 'Noon':
+ elif time == "Noon":
hour = 12
mins = 0
else:
@@ -127,11 +142,10 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
res = "%03d:%02d:00" % (hour, mins)
-
# day-rotate situation where they went on beyond midnight
# it's uncommon enough to handle by listing exceptional days
# (sometimes the division time is out of order because that is where it is inserted in the record -- maybe should patch to handle)
- #print previoustime, res, bIsDivisionTime, stampurl.sdate
+ # print previoustime, res, bIsDivisionTime, stampurl.sdate
if previoustime and res < previoustime:
if stampurl.sdate in ["2005-03-10"]:
if previoustime < "024":
@@ -142,14 +156,21 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
elif stampurl.sdate in ["2002-10-28"]:
return res
- elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in ["2003-10-20", "2000-10-03", "2000-07-24", "2011-01-17"]:
+ elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in [
+ "2003-10-20",
+ "2000-10-03",
+ "2000-07-24",
+ "2011-01-17",
+ ]:
hour += 24
else:
- print('TIME: time rotation (from %s to %s %s) not close to midnight %s' % (previoustime, time, res, repr(stampurl)))
+ print(
+ "TIME: time rotation (from %s to %s %s) not close to midnight %s"
+ % (previoustime, time, res, repr(stampurl))
+ )
res = "%03d:%02d:00" % (hour, mins)
-
# capture the case where we are out of order by more than a few minutes
# (divisions are often out of order slightly)
@@ -160,7 +181,10 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
previoustimeminutes = previoustimehour * 60 + int(prevtimeMatch.group(2))
if timeminutes < previoustimeminutes:
if not bIsDivisionTime or (previoustimeminutes - timeminutes > 10):
- print('TIME: time out of order, from %s to %s (division=%s) %s' % (previoustime, res, bIsDivisionTime, repr(stampurl)))
+ print(
+ "TIME: time out of order, from %s to %s (division=%s) %s"
+ % (previoustime, res, bIsDivisionTime, repr(stampurl))
+ )
return res
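
The returned stamp uses a three-digit hour field so that sittings running past midnight still sort correctly; an illustrative sketch of the format:

    print("%03d:%02d:00" % (19, 15))      # 7.15 pm -> "019:15:00"
    print("%03d:%02d:00" % (1 + 24, 30))  # 1.30 am after day rotation -> "025:30:00"
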
@@ -168,84 +192,72 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
# http://www.bigbaer.com/reference/character_entity_reference.htm
# Make sure you update WriteXMLHeader in xmlfilewrite.py also!
entitymap = {
- '&nbsp;':' ',
- '&amp;':'&amp;',
-
- # see http://www.cs.tut.fi/~jkorpela/www/windows-chars.html for a useful, if now dated in
- # terms of browser support for the proper solutions, info on windows ndash/mdash (150/151)
- '&#150;':'&ndash;', # convert windows latin-1 extension ndash into a real one
- '&#151;':'&mdash;', # likewise mdash
- '&#161;':'&iexcl;', # inverted exclamation mark
- '&#247;':'&divide;', # division sign
-
- '&#232;':'&egrave;', # this is e-grave
- '&#233;':'&eacute;', # this is e-acute
- '&#234;':'&ecirc;', # this is e-hat
- '&#235;':'&euml;', # this is e-double-dot
-
- '&#223;':'&szlig;',
- '&#224;':'&agrave;', # this is a-grave
- '&#225;':'&aacute;', # this is a-acute
- '&#226;':'&acirc;', # this is a-hat as in debacle
- '&#227;':'&atilde;', # this is a-hat as in debacle
- '&#228;':'&auml;',
-
- '&#244;':'&ocirc;', # this is o-hat
- '&#246;':'&ouml;', # this is o-double-dot
- '&#214;':'&Ouml;', # this is capital o-double-dot
- '&#243;':'&oacute;', # this is o-acute
- '&#248;':'&oslash;', # this is o-slash
- '&#245;':'&otilde;', # this is o-tilde
-
- '&#237;':'&iacute;', # this is i-acute
- '&#238;':'&icirc;', # this is i-circumflex
- '&#239;':'&iuml;', # this is i-double-dot, as in naive
-
- '&#231;':'&ccedil;', # this is cedilla
- '&#250;':'&uacute;',
- '&#252;':'&uuml;', # this is u-double-dot
- '&#241;':'&ntilde;', # spanish n as in Senor
- '&#254;':'&thorn;',
-
- '&#177;':'&plusmn;', # this is +/- symbol
- '&#163;':'&pound;', # UK currency
- '&#167;':'&sect;', # UK currency
- '&#169;':'&copy;',
- '&#183;':'&middot;', # middle dot
- '&#176;':'&deg;', # this is the degrees
- '&#186;':'&ordm;', # this is the M ordinal
- '&#174;':'&reg;', # this is the degrees
- '&#182;':'&para;', # end-paragraph (pi) symbol
-
- '&#181;':'&micro;', # this is one quarter symbol
- '&#188;':'&frac14;', # this is one quarter symbol
- '&#189;':'&frac12;', # this is one half symbol
- '&#190;':'&frac34;', # this is three quarter symbol
-
- '&#35;':'#', # this is hash
- '&#95;':'_', # this is underscore symbol
- '&#095;':'_', # this is underscore symbol
-
- '&#039;':"'", # possession apostrophe
- "&euro;":'&#8364;', # this is euro currency
- "&trade;":'&#8482;',
- "&bull;":'&#8226;',
- '&lquo;':"'",
- '&rquo;':"'",
- '&minus;':"-",
-
- '&#145;':"'",
- '&#146;':"'",
- '&#147;':'&quot;',
- '&#148;':'&quot;',
- '&#133;':'...',
- '&#134;':'&#8224;',
-
- '&sup2;':'&#178;',
- '&rsquo;':"'",
- '&oelig;':'&#339;',
- '&aelig;':'&#230;',
- '&dagger;':'&#8224;',
+ "&nbsp;": " ",
+ "&amp;": "&amp;",
+ # see http://www.cs.tut.fi/~jkorpela/www/windows-chars.html for a useful, if now dated in
+ # terms of browser support for the proper solutions, info on windows ndash/mdash (150/151)
+ "&#150;": "&ndash;",  # convert windows latin-1 extension ndash into a real one
+ "&#151;": "&mdash;",  # likewise mdash
+ "&#161;": "&iexcl;",  # inverted exclamation mark
+ "&#247;": "&divide;",  # division sign
+ "&#232;": "&egrave;",  # this is e-grave
+ "&#233;": "&eacute;",  # this is e-acute
+ "&#234;": "&ecirc;",  # this is e-hat
+ "&#235;": "&euml;",  # this is e-double-dot
+ "&#223;": "&szlig;",
+ "&#224;": "&agrave;",  # this is a-grave
+ "&#225;": "&aacute;",  # this is a-acute
+ "&#226;": "&acirc;",  # this is a-hat as in debacle
+ "&#227;": "&atilde;",  # this is a-hat as in debacle
+ "&#228;": "&auml;",
+ "&#244;": "&ocirc;",  # this is o-hat
+ "&#246;": "&ouml;",  # this is o-double-dot
+ "&#214;": "&Ouml;",  # this is capital o-double-dot
+ "&#243;": "&oacute;",  # this is o-acute
+ "&#248;": "&oslash;",  # this is o-slash
+ "&#245;": "&otilde;",  # this is o-tilde
+ "&#237;": "&iacute;",  # this is i-acute
+ "&#238;": "&icirc;",  # this is i-circumflex
+ "&#239;": "&iuml;",  # this is i-double-dot, as in naive
+ "&#231;": "&ccedil;",  # this is cedilla
+ "&#250;": "&uacute;",
+ "&#252;": "&uuml;",  # this is u-double-dot
+ "&#241;": "&ntilde;",  # spanish n as in Senor
+ "&#254;": "&thorn;",
+ "&#177;": "&plusmn;",  # this is +/- symbol
+ "&#163;": "&pound;",  # UK currency
+ "&#167;": "&sect;",  # UK currency
+ "&#169;": "&copy;",
+ "&#183;": "&middot;",  # middle dot
+ "&#176;": "&deg;",  # this is the degrees
+ "&#186;": "&ordm;",  # this is the M ordinal
+ "&#174;": "&reg;",  # this is the degrees
+ "&#182;": "&para;",  # end-paragraph (pi) symbol
+ "&#181;": "&micro;",  # this is one quarter symbol
+ "&#188;": "&frac14;",  # this is one quarter symbol
+ "&#189;": "&frac12;",  # this is one half symbol
+ "&#190;": "&frac34;",  # this is three quarter symbol
+ "&#35;": "#",  # this is hash
+ "&#95;": "_",  # this is underscore symbol
+ "&#095;": "_",  # this is underscore symbol
+ "&#039;": "'",  # possession apostrophe
+ "&euro;": "&#8364;",  # this is euro currency
+ "&trade;": "&#8482;",
+ "&bull;": "&#8226;",
+ "&lquo;": "'",
+ "&rquo;": "'",
+ "&minus;": "-",
+ "&#145;": "'",
+ "&#146;": "'",
+ "&#147;": "&quot;",
+ "&#148;": "&quot;",
+ "&#133;": "...",
+ "&#134;": "&#8224;",
+ "&sup2;": "&#178;",
+ "&rsquo;": "'",
+ "&oelig;": "&#339;",
+ "&aelig;": "&#230;",
+ "&dagger;": "&#8224;",
}
entitymaprev = entitymap.values()
@@ -253,14 +265,14 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl):
def StripAnchorTags(text):
raise Exception("I've never called this function, so test it")
- abf = re.split('(<[^>]*>)', text)
+ abf = re.split("(<[^>]*>)", text)
- ret = ''
+ ret = ""
for ab in abf:
- if re.match('<a[^>]*>(?i)', ab):
+ if re.match("<a[^>]*>(?i)", ab):
pass
- elif re.match('</a>(?i)', ab):
+ elif re.match("</a>(?i)", ab):
pass
else:
@@ -270,148 +282,163 @@ def StripAnchorTags(text):
def WriteCleanText(fout, text, striphref=True):
- text = re.sub('', '', text)
- abf = re.split('(<[^>]*>)', text)
+ text = re.sub("", "", text)
+ abf = re.split("(<[^>]*>)", text)
for ab in abf:
# delete comments and links
- if re.match('<!-*[^>]*?->', ab):
+ if re.match("<!-*[^>]*?->", ab):
pass
# XXX Differs from pullgluepages version
- elif striphref and re.match('<a[^>]+>(?i)', ab):
- anamem = re.match('<a name\s*=\s*"([^"]*)"(?i)', ab)
+ elif striphref and re.match("<a[^>]+>(?i)", ab):
+ anamem = re.match('<a name\s*=\s*"([^"]*)"(?i)', ab)
if anamem:
fout.write('<a name="%s">' % anamem.group(1))
- elif striphref and re.match('</?a>(?i)', ab):
+ elif striphref and re.match("</?a>(?i)", ab):
pass
# spaces only inside tags
- elif re.match('<[^>]*>', ab):
- fout.write(re.sub('\s', ' ', ab))
+ elif re.match("<[^>]*>", ab):
+ fout.write(re.sub("\s", " ", ab))
# take out spurious > symbols and dos linefeeds
else:
- fout.write(re.sub('>|\r', '', ab))
+ fout.write(re.sub(">|\r", "", ab))
# Legacy patch system, use patchfilter.py and patchtool now
def ApplyFixSubstitutions(text, sdate, fixsubs):
for sub in fixsubs:
- if sub[3] == 'all' or sub[3] == sdate:
+ if sub[3] == "all" or sub[3] == sdate:
(text, n) = re.subn(sub[0], sub[1], text)
if (sub[2] != -1) and (n != sub[2]):
print(sub)
- raise Exception('wrong number of substitutions %d on %s' % (n, sub[0]))
+ raise Exception("wrong number of substitutions %d on %s" % (n, sub[0]))
return text
# this only accepts <i> and <a> tags
def StraightenHTMLrecurse(stex, stampurl):
# split the text into <i></i> and <a></a> and <sup></sup> and <sub></sub>
- qisup = re.search(r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?)</\2>)(?i)', stex)
+ qisup = re.search(
+ r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?)</\2>)(?i)', stex
+ )
if qisup:
qtagtype = qisup.group(2)
- qhref = qisup.group(3) or ''
- qtag = ('<%s%s>' % (qtagtype, qhref), '</%s>' % qtagtype)
+ qhref = qisup.group(3) or ""
+ qtag = ("<%s%s>" % (qtagtype, qhref), "</%s>" % qtagtype)
if not qisup:
qisup = re.search('(<(a) href="([^"]*)">(.*?)</a>)(?i)', stex)
if qisup:
- qtag = ('<a href="%s">' % qisup.group(3), '</a>')
+ qtag = ('<a href="%s">' % qisup.group(3), "</a>")
if qisup:
- sres = StraightenHTMLrecurse(stex[:qisup.start(1)], stampurl)
+ sres = StraightenHTMLrecurse(stex[: qisup.start(1)], stampurl)
sres.append(qtag[0])
sres.extend(StraightenHTMLrecurse(qisup.group(4), stampurl))
sres.append(qtag[1])
- sres.extend(StraightenHTMLrecurse(stex[qisup.end(1):], stampurl))
+ sres.extend(StraightenHTMLrecurse(stex[qisup.end(1) :], stampurl))
return sres
- sres = re.split('(&[a-z0-9]*?;|&#\d+;|&quot;|\xa3|&amp;|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|&lt;|&gt;)', stex)
+ sres = re.split(
+ '(&[a-z0-9]*?;|&#\d+;|&quot;|\xa3|&amp;|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|&lt;|&gt;)',
+ stex,
+ )
for i in range(len(sres)):
- #print "sresi ", sres[i], "\n"
- #print "-----------------------------------------------\n"
+ # print "sresi ", sres[i], "\n"
+ # print "-----------------------------------------------\n"
if not sres[i]:
pass
- elif re.match('&#[0-9]+;', sres[i]) and not re.match('&#[345][0-9];', sres[i]):
+ elif re.match("&#[0-9]+;", sres[i]) and not re.match("&#[345][0-9];", sres[i]):
pass
- elif sres[i][0] == '&':
+ elif sres[i][0] == "&":
if sres[i] in entitymap:
sres[i] = entitymap[sres[i]]
elif sres[i] in entitymaprev:
pass
- elif sres[i] == '&mdash;': # special case as entitymap maps it with spaces
+ elif sres[i] == "&mdash;":  # special case as entitymap maps it with spaces
pass
- elif sres[i] in ('&quot;', '&amp;', '&lt;', '&gt;'):
+ elif sres[i] in ("&quot;", "&amp;", "&lt;", "&gt;"):
pass
- elif sres[i] in ('&ldquo;', '&rdquo;'):
- sres[i] = '&quot;'
+ elif sres[i] in ("&ldquo;", "&rdquo;"):
+ sres[i] = "&quot;"
else:
- raise Exception(sres[i] + ' unknown ent')
- sres[i] = 'UNKNOWN-ENTITY'
+ raise Exception(sres[i] + " unknown ent")
+ sres[i] = "UNKNOWN-ENTITY"
elif sres[i] == '"':
- sres[i] = '&quot;'
+ sres[i] = "&quot;"
# junk chars sometimes get in
# NB this only works if the characters are split in the regexp above
- elif sres[i] == '\x01':
- sres[i] = ''
- elif sres[i] == '\x0e':
- sres[i] = ' '
- elif sres[i] == '\x14':
- sres[i] = ' '
- elif sres[i] == '\x92':
+ elif sres[i] == "\x01":
+ sres[i] = ""
+ elif sres[i] == "\x0e":
+ sres[i] = " "
+ elif sres[i] == "\x14":
+ sres[i] = " "
+ elif sres[i] == "\x92":
sres[i] = "'"
- elif sres[i] == '\xa3':
- sres[i] = '&pound;'
- elif sres[i] == '\xb0':
- sres[i] = '&deg;'
- elif sres[i] == '\xab':
- sres[i] = '&eacute;'
- elif sres[i] == '\xe9':
- sres[i] = '&eacute;'
- elif sres[i] == '\xc3\xb8':
- sres[i] = '&oslash;'
- elif sres[i] == '\xc3\xb1':
- sres[i] = '&ntilde;'
-
- elif re.match('</?i>$(?i)', sres[i]):
- sres[i] = '' # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE'
-
- elif re.match('$', sres[i]): # what is this? wrans 2003-05-13 has one
- sres[i] = ''
+ elif sres[i] == "\xa3":
+ sres[i] = "&pound;"
+ elif sres[i] == "\xb0":
+ sres[i] = "&deg;"
+ elif sres[i] == "\xab":
+ sres[i] = "&eacute;"
+ elif sres[i] == "\xe9":
+ sres[i] = "&eacute;"
+ elif sres[i] == "\xc3\xb8":
+ sres[i] = "&oslash;"
+ elif sres[i] == "\xc3\xb1":
+ sres[i] = "&ntilde;"
+
+ elif re.match("</?i>$(?i)", sres[i]):
+ sres[i] = "" # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE'
+
+ elif re.match(
+ "$", sres[i]
+ ): # what is this? wrans 2003-05-13 has one
+ sres[i] = ""
# allow brs through
- elif re.match('<br ?/?>$(?i)', sres[i]):
- sres[i] = '<br/>'
+ elif re.match("<br ?/?>$(?i)", sres[i]):
+ sres[i] = "<br/>"
# discard garbage that appears in recent today postings
- elif re.match('$(?i)', sres[i]):
- sres[i] = ''
+ elif re.match("$(?i)", sres[i]):
+ sres[i] = ""
- elif sres[i][0] == '<' or sres[i][0] == '>':
+ elif sres[i][0] == "<" or sres[i][0] == ">":
print("Part:", sres[i][0])
- print("All:",sres[i])
+ print("All:", sres[i])
print("stex:", stex)
print("raising")
- raise ContextException('tag %s tag out of place in %s' % (sres[i], stex), stamp=stampurl, fragment=stex)
+ raise ContextException(
+ "tag %s tag out of place in %s" % (sres[i], stex),
+ stamp=stampurl,
+ fragment=stex,
+ )
return sres
# The lookahead assertion (?=<table) stops a paragraph match running on into a following table
-restmatcher = paratag + '|</p>|</?ul>|&nbsp;|</?font[^>]*>(?i)'
-reparts = re.compile('(<table>|(?=<table)<[^>]*?>|' + restmatcher + ')')
-
-retable = re.compile('<table.*?>(?i)')
-retablestart = re.compile('<p>|</?ul>|</?br>|</?font[^>]*>|<td[^>]*>$(?i)')
-reparaempty = re.compile('(?:\s|</?i>|&nbsp;)*$(?i)')
-reitalif = re.compile('\s*<i>\s*$(?i)')
+restmatcher = paratag + "|</p>|</?ul>|&nbsp;|</?font[^>]*>(?i)"
+reparts = re.compile("(<table>|(?=<table)<[^>]*?>|" + restmatcher + ")")
+
+retable = re.compile("<table.*?>(?i)")
+retablestart = re.compile(
+ "<p>|</?ul>|</?br>|</?font[^>]*>|<td[^>]*>$(?i)"
+)
+reparaempty = re.compile("(?:\s|</?i>|&nbsp;)*$(?i)")
+reitalif = re.compile("\s*<i>\s*$(?i)")
# Break text into paragraphs.
# the result alternates between lists of space types, and strings
@@ -423,12 +450,11 @@ def SplitParaSpace(text, stampurl):
# list of space objects, list of string
spclist = []
- pstring = ''
+ pstring = ""
parts = reparts.split(text)
newparts = []
# split up the start into component parts
for nf in parts:
-
# a tiny bit of extra splitting up as output
if retablestart.match(nf) and not retable.match(nf):
newparts.extend(reparts2.split(nf))
@@ -437,11 +463,11 @@ def SplitParaSpace(text, stampurl):
# get rid of blank and boring paragraphs
if reparaempty.match(nf):
- if pstring and re.search('\S', nf):
+ if pstring and re.search("\S", nf):
print(text)
- print('---' + pstring)
- print('---' + nf)
- raise Exception(' it carried across empty para ')
+ print("---" + pstring)
+ print("---" + nf)
+ raise Exception(" it carried across empty para ")
continue
# list of space type objects
@@ -456,11 +482,10 @@ def SplitParaSpace(text, stampurl):
print(text)
print(spclist)
print(pstring)
- raise Exception(' double italic in paraspace ')
- pstring = ''
+ raise Exception(" double italic in paraspace ")
+ pstring = ""
continue
-
# we now have a string of a paragraph which we are putting into the list.
# table type
@@ -468,7 +493,7 @@ def SplitParaSpace(text, stampurl):
if retable.match(nf):
if pstring:
print(text)
- raise Exception(' non-empty preceding string ')
+ raise Exception(" non-empty preceding string ")
pstring = nf
bthisparaalone = True
@@ -479,21 +504,22 @@ def SplitParaSpace(text, stampurl):
else:
pstring = lnf.strip()
-
# check that paragraphs have some text
- if re.match('(?:<[^>]*>|\s)*$', pstring):
+ if re.match("(?:<[^>]*>|\s)*$", pstring):
print("\nspclist:", spclist)
print("\npstring:", pstring)
print("\nthe text:", text[:100])
print("\nnf:", nf)
- raise ContextException('no text in paragraph', stamp=stampurl, fragment=pstring)
+ raise ContextException(
+ "no text in paragraph", stamp=stampurl, fragment=pstring
+ )
# check that paragraph spaces aren't only font text, and have something
# real in them, unless they are breaks because of tables
if not (bprevparaalone or bthisparaalone):
bnonfont = False
for sl in spclist:
- if not re.match('</?font[^>]*>(?i)', sl):
+ if not re.match("</?font[^>]*>(?i)", sl):
bnonfont = True
if not bnonfont:
print("text:", text)
@@ -502,17 +528,20 @@ def SplitParaSpace(text, stampurl):
print("----------")
print("nf", nf)
print("----------")
- raise ContextException('font found in middle of paragraph should be a paragraph break or removed', stamp=stampurl, fragment=pstring)
+ raise ContextException(
+ "font found in middle of paragraph should be a paragraph break or removed",
+ stamp=stampurl,
+ fragment=pstring,
+ )
bprevparaalone = bthisparaalone
-
# put the preceding space, then the string into output list
res.append(spclist)
res.append(pstring)
- #print "???%s???" % pstring
+ # print "???%s???" % pstring
- spclist = [ ]
- pstring = ''
+ spclist = []
+ pstring = ""
# findal spaces into the output list
res.append(spclist)
@@ -523,27 +552,29 @@ def SplitParaSpace(text, stampurl):
# Break text into paragraphs and mark the paragraphs according to their indentation
def SplitParaIndents(text, stampurl):
dell = SplitParaSpace(text, stampurl)
- #print "dell", dell
+ # print "dell", dell
- res = [ ]
- resdent = [ ]
+ res = []
+ resdent = []
bIndent = 0
for i in range(len(dell)):
if (i % 2) == 0:
for sp in dell[i]:
- if re.match('(?:<blockquote>)?<ul>(?i)', sp):
- if bIndent==1:
- print(dell[i - 1: i + 1])
- raise ContextException(' already indented ', stamp=stampurl, fragment=sp)
+ if re.match("(?:<blockquote>)?<ul>(?i)", sp):
+ if bIndent == 1:
+ print(dell[i - 1 : i + 1])
+ raise ContextException(
+ " already indented ", stamp=stampurl, fragment=sp
+ )
bIndent = 1
- elif re.match('(?:&nbsp;)?</ul>(?i)', sp):
+ elif re.match("(?:&nbsp;)?</ul>(?i)", sp):
# no error
- #if not bIndent:
+ # if not bIndent:
# raise Exception, ' already not-indentented '
bIndent = 0
elif re.match('', sp):
bIndent = 2
- elif bIndent == 2 and re.match(' ', sp):
+ elif bIndent == 2 and re.match("", sp):
bIndent = 0
continue
@@ -555,7 +586,7 @@ def SplitParaIndents(text, stampurl):
tex = dell[i]
cindent = bIndent > 0 and 1 or 0
- qitbod = re.match('<i>([\s\S]*?)</i>[.:]?$', tex)
+ qitbod = re.match("<i>([\s\S]*?)</i>[.:]?$", tex)
if qitbod:
tex = qitbod.group(1)
cindent = cindent + 2
@@ -563,14 +594,7 @@ def SplitParaIndents(text, stampurl):
res.append(tex)
resdent.append(cindent)
- #if bIndent:
+ # if bIndent:
# print text
# raise ' still indented after last space '
return (res, resdent)
-
-
-
-
-
-
-
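
SplitParaIndents thus returns two parallel lists, paragraph texts and indent codes: bit 1 set when the paragraph sits inside an indent block, bit 2 added when the body was wholly italic. A consuming sketch:

    paras, indents = SplitParaIndents(text, stampurl)
    for tex, cindent in zip(paras, indents):
        indented = bool(cindent & 1)  # inside an indent block
        italic = bool(cindent & 2)    # body was wrapped in <i>...</i>
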
diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py
index f4e41fe0..43d517ac 100755
--- a/pyscraper/new_hansard.py
+++ b/pyscraper/new_hansard.py
@@ -1,31 +1,34 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
+import codecs
import datetime
-import re
+import io
import os
+import re
import sys
-import io
import tempfile
-from lxml import etree
import xml.sax
+
import miscfuncs
+from lxml import etree
xmlvalidate = xml.sax.make_parser()
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
-sys.path.append(os.path.join(os.path.dirname(__file__), 'lords'))
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(os.path.join(os.path.dirname(__file__), "lords"))
-from pullgluepages import MakeDayMap, GetFileDayVersions
+from contextexception import ContextException
+from gidmatching import DoFactorDiff, PrepareXMLForDiff
from miscfuncs import pwxmldirs
+from pullgluepages import GetFileDayVersions, MakeDayMap
from resolvemembernames import MemberList
from resolvenames import LordsList
-from filtersentence_xml import PhraseTokenize
-from gidmatching import PrepareXMLForDiff, DoFactorDiff
-from contextexception import ContextException
from xmlfilewrite import WriteXMLHeader
-parldata = '../../../parldata/'
+from filtersentence_xml import PhraseTokenize
+
+parldata = "../../../parldata/"
xml_parser = etree.XMLParser(ns_clean=True)
etree.set_default_parser(xml_parser)
@@ -33,11 +36,11 @@
class PimsList(MemberList):
def pbc_match(self, name, date):
- name = re.sub(r'\n', ' ', name)
+ name = re.sub(r"\n", " ", name)
# names are mostly lastname,\nfirstname so reform first
- if re.search(',', name):
- last, first = name.split(',')
- full = '{0} {1}'.format(first.strip(), last.strip())
+ if re.search(",", name):
+ last, first = name.split(",")
+ full = "{0} {1}".format(first.strip(), last.strip())
# apart from committee chairman which we can use as is
else:
full = name.strip()
@@ -46,8 +49,8 @@ def pbc_match(self, name, date):
mem_id = ids.pop()
person_id = self.membertopersonmap[mem_id]
member = self.persons[person_id]
- member['person_id'] = member.get('id')
- member['name'] = self.name_on_date(member['person_id'], date)
+ member["person_id"] = member.get("id")
+ member["name"] = self.name_on_date(member["person_id"], date)
return member
return None
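
pbc_match normalises the "surname,\nforename" form used in committee lists before matching; a sketch with a made-up name and date:

    pims = PimsList()
    member = pims.pbc_match("Smith,\nJohn", "2015-06-01")
    # reformed internally to "John Smith"; returns the person dict
    # (with person_id and a date-appropriate name) or None if unmatched
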
@@ -58,119 +61,114 @@ class BaseParseDayXML(object):
resolver = PimsList()
type_to_xpath = {
- 'debate': (
+ "debate": (
'//ns:System[@type="Debate"]',
- 'http://www.parliament.uk/commons/hansard/print'
+ "http://www.parliament.uk/commons/hansard/print",
),
- 'westminhall': (
+ "westminhall": (
'//ns:System[@type="WestHall"]',
- 'http://www.parliament.uk/commons/hansard/print'
+ "http://www.parliament.uk/commons/hansard/print",
),
- 'lords': (
+ "lords": (
'//ns:System[@type="Debate"]',
- 'http://www.parliament.uk/lords/hansard/print'
+ "http://www.parliament.uk/lords/hansard/print",
),
- 'standing': (
+ "standing": (
'//ns:System[@type="Debate"]',
- 'http://www.parliament.uk/commons/hansard/print'
+ "http://www.parliament.uk/commons/hansard/print",
),
}
- oral_headings = [
- 'hs_3OralAnswers'
- ]
+ oral_headings = ["hs_3OralAnswers"]
major_headings = [
- 'hs_6bDepartment',
- 'hs_6bBigBoldHdg',
- 'hs_2cBillTitle',
- 'hs_2cUrgentQuestion',
- 'hs_3cMainHdg',
- 'hs_2BusinessWODebate',
- 'hs_2cStatement',
- 'hs_2BillTitle',
- 'hs_6bBillTitle',
- 'hs_6bBusinessB4Questions',
- 'hs_6bPrivateBusiness',
- 'hs_6bRoyalAssent',
- 'hs_6bBillsPresented', # FIXME should grab text of following tag
- 'hs_6fCntrItalHdg',
- 'hs_2cSO24Application',
- 'hs_6bFormalmotion',
- 'hs_2cDeferredDiv',
- 'hs_3cPetitions',
+ "hs_6bDepartment",
+ "hs_6bBigBoldHdg",
+ "hs_2cBillTitle",
+ "hs_2cUrgentQuestion",
+ "hs_3cMainHdg",
+ "hs_2BusinessWODebate",
+ "hs_2cStatement",
+ "hs_2BillTitle",
+ "hs_6bBillTitle",
+ "hs_6bBusinessB4Questions",
+ "hs_6bPrivateBusiness",
+ "hs_6bRoyalAssent",
+ "hs_6bBillsPresented", # FIXME should grab text of following tag
+ "hs_6fCntrItalHdg",
+ "hs_2cSO24Application",
+ "hs_6bFormalmotion",
+ "hs_2cDeferredDiv",
+ "hs_3cPetitions",
]
chair_headings = [
- 'hs_76fChair',
+ "hs_76fChair",
]
minor_headings = [
- 'hs_8Question',
- 'hs_8GenericHdg',
- 'hs_8Clause',
- 'hs_7SmCapsHdg',
- 'hs_7PrivateBusinessHdg',
- 'hs_7Bill',
- 'hs_6bcBigBoldHdg',
- 'hs_6bCorrection',
+ "hs_8Question",
+ "hs_8GenericHdg",
+ "hs_8Clause",
+ "hs_7SmCapsHdg",
+ "hs_7PrivateBusinessHdg",
+ "hs_7Bill",
+ "hs_6bcBigBoldHdg",
+ "hs_6bCorrection",
]
generic_headings = [
- 'hs_2cDebatedMotion',
- 'hs_2cGenericHdg',
- 'hs_2GenericHdg',
- ]
- whall_headings = [
- 'hs_2cWestHallDebate',
- 'hs_2WestHallDebate'
+ "hs_2cDebatedMotion",
+ "hs_2cGenericHdg",
+ "hs_2GenericHdg",
]
+ whall_headings = ["hs_2cWestHallDebate", "hs_2WestHallDebate"]
paras = [
- 'hs_Para',
- 'hs_AmendmentLevel1',
- 'hs_AmendmentLevel2',
- 'hs_AmendmentLevel3',
- 'hs_AmendmentLevel4',
- 'hs_AmendmentHeading',
- 'hs_newline10',
- 'hs_newline12',
- 'hs_Question',
- 'hs_6CntrCapsHdg',
+ "hs_Para",
+ "hs_AmendmentLevel1",
+ "hs_AmendmentLevel2",
+ "hs_AmendmentLevel3",
+ "hs_AmendmentLevel4",
+ "hs_AmendmentHeading",
+ "hs_newline10",
+ "hs_newline12",
+ "hs_Question",
+ "hs_6CntrCapsHdg",
]
indents = [
- 'hs_quote',
- 'hs_QuoteAllIndent',
- 'hs_ParaIndent',
- 'hs_AmendmentLevel0',
- 'hs_IndentOne',
- 'hs_IndentTwo',
+ "hs_quote",
+ "hs_QuoteAllIndent",
+ "hs_ParaIndent",
+ "hs_AmendmentLevel0",
+ "hs_IndentOne",
+ "hs_IndentTwo",
]
empty_tags = [
- 'StartProcedure',
- 'EndProcedure',
+ "StartProcedure",
+ "EndProcedure",
]
ignored_tags = [
- 'hs_TimeCode',
- 'hs_6bPetitions',
- 'hs_3MainHdg',
- 'hs_3cWestHall',
- 'hs_Venue'
+ "hs_TimeCode",
+ "hs_6bPetitions",
+ "hs_3MainHdg",
+ "hs_3cWestHall",
+ "hs_Venue",
]
root = None
- ns = ''
+ ns = ""
ns_map = {}
- division_number_element = 'Number'
- division_ayes_attribute = 'ayes'
- division_noes_attribute = 'noes'
+ division_number_element = "Number"
+ division_ayes_attribute = "ayes"
+ division_noes_attribute = "noes"
debate_type = None
current_speech = None
- date = ''
- rev = 'a'
+ date = ""
+ rev = "a"
use_pids = True
current_col = 0
current_speech_col = 0
current_speech_num = 0
next_speech_num = 0
current_speech_part = 1
- current_time = ''
+ current_time = ""
output_heading = False
skip_tag = None
uc_titles = False
@@ -181,14 +179,14 @@ def __init__(self):
def reset(self):
self.debate_type = None
self.current_speech = None
- self.date = ''
- self.rev = 'a'
+ self.date = ""
+ self.rev = "a"
self.current_col = 0
self.current_speech_col = 0
self.current_speech_num = 0
self.next_speech_num = 0
self.current_speech_part = 1
- self.current_time = ''
+ self.current_time = ""
self.root = None
self.input_root = None
self.output_heading = False
@@ -204,18 +202,15 @@ def is_pre_new_parser(self):
def get_tag_name_no_ns(self, tag):
# remove annoying namespace for brevities sake
tag_name = str(tag.tag)
- tag_name = tag_name.replace(
- '{{{0}}}'.format(self.ns),
- ''
- )
+ tag_name = tag_name.replace("{{{0}}}".format(self.ns), "")
return tag_name
def get_pid(self):
- pid = '{0}{1}.{2}/{3}'.format(
+ pid = "{0}{1}.{2}/{3}".format(
self.rev,
self.current_speech_col,
self.current_speech_num,
- self.current_speech_part
+ self.current_speech_part,
)
self.current_speech_part = self.current_speech_part + 1
return pid
@@ -224,21 +219,21 @@ def get_speech_id_first_part(self):
return self.date
def get_speech_url(self, url):
- return ''
+ return ""
def get_major_url(self, url):
- return ''
+ return ""
def get_minor_url(self, url):
- return ''
+ return ""
def get_speech_id(self):
- speech_id = 'uk.org.publicwhip/{0}/{1}{2}.{3}.{4}'.format(
+ speech_id = "uk.org.publicwhip/{0}/{1}{2}.{3}.{4}".format(
self.debate_type,
self.get_speech_id_first_part(),
self.rev,
self.current_speech_col,
- self.next_speech_num
+ self.next_speech_num,
)
self.current_speech_num = self.next_speech_num
if self.current_speech_col == self.current_col:
@@ -257,8 +252,8 @@ def check_for_pi(self, tag):
def check_for_pi_at_start(self, tag):
self.pi_at_start = False
- for c in tag.xpath('./node()'):
- if isinstance(c, str) and re.match('\s*$', c):
+ for c in tag.xpath("./node()"):
+ if isinstance(c, str) and re.match("\s*$", c):
continue
elif type(c) is etree._ProcessingInstruction:
self.parse_pi(c)
@@ -268,12 +263,12 @@ def check_for_pi_at_start(self, tag):
# this just makes any gid redirection easier
def get_text_from_element(self, el):
text = self.get_single_line_text_from_element(el)
- text = '\n{0}\n'.format(text)
+ text = "\n{0}\n".format(text)
return text
def get_single_line_text_from_element(self, el):
- text = ''.join(el.xpath('.//text()'))
- text = re.sub('\n', ' ', text).strip()
+ text = "".join(el.xpath(".//text()"))
+ text = re.sub("\n", " ", text).strip()
return text
def clear_current_speech(self):
@@ -284,29 +279,30 @@ def clear_current_speech(self):
def new_speech(self, member, url):
self.clear_current_speech()
- self.current_speech = etree.Element('speech')
- self.current_speech.set('id', self.get_speech_id())
+ self.current_speech = etree.Element("speech")
+ self.current_speech.set("id", self.get_speech_id())
if member is not None:
- self.current_speech.set('speakername', member['name'])
- if 'type' in member:
- self.current_speech.set('type', member['type'])
- if 'person_id' in member:
- self.current_speech.set('person_id', member['person_id'])
+ self.current_speech.set("speakername", member["name"])
+ if "type" in member:
+ self.current_speech.set("type", member["type"])
+ if "person_id" in member:
+ self.current_speech.set("person_id", member["person_id"])
else:
- self.current_speech.set('nospeaker', 'true')
+ self.current_speech.set("nospeaker", "true")
else:
- self.current_speech.set('nospeaker', 'true')
- self.current_speech.set('colnum', self.current_col)
- self.current_speech.set('time', self.current_time)
- self.current_speech.set(
- 'url',
- self.get_speech_url(url)
- )
+ self.current_speech.set("nospeaker", "true")
+ self.current_speech.set("colnum", self.current_col)
+ self.current_speech.set("time", self.current_time)
+ self.current_speech.set("url", self.get_speech_url(url))
self.current_speech_part = 1
def parse_system_header(self, header):
- sitting = header.xpath('./ns:Sitting', namespaces=self.ns_map)[0]
- date = datetime.datetime.strptime(sitting.get('short-date'), '%d %B %Y').date().isoformat()
+ sitting = header.xpath("./ns:Sitting", namespaces=self.ns_map)[0]
+ date = (
+ datetime.datetime.strptime(sitting.get("short-date"), "%d %B %Y")
+ .date()
+ .isoformat()
+ )
if date:
self.date = date
@@ -316,110 +312,111 @@ def handle_minus_member(self, member):
def _parse_member_or_b(self, tag):
member_tag = None
tag_name = self.get_tag_name_no_ns(tag)
- if tag_name == 'B':
- member_tags = tag.xpath('.//ns:Member', namespaces=self.ns_map)
+ if tag_name == "B":
+ member_tags = tag.xpath(".//ns:Member", namespaces=self.ns_map)
if len(member_tags) == 1:
member_tag = member_tags[0]
- elif tag_name == 'Member':
+ elif tag_name == "Member":
member_tag = tag
return member_tag
def parse_member(self, tag):
member_tag = self._parse_member_or_b(tag)
if member_tag is not None:
- mnis_id = member_tag.get('MnisId')
+ mnis_id = member_tag.get("MnisId")
pims_id = None
- if mnis_id in (None, '-1'):
- pims_id = member_tag.get('PimsId')
+ if mnis_id in (None, "-1"):
+ pims_id = member_tag.get("PimsId")
if pims_id in (None, "0", "-1"):
return self.handle_minus_member(member_tag)
- if pims_id: # Old way
+ if pims_id: # Old way
member = self.resolver.match_by_pims(pims_id, self.date)
else:
member = self.resolver.match_by_mnis(mnis_id, self.date)
if member is not None:
- member['person_id'] = member.get('id')
- member['name'] = self.resolver.name_on_date(member['person_id'], self.date)
- if member_tag.get('ContributionType'):
- member['type'] = member_tag.get('ContributionType')
+ member["person_id"] = member.get("id")
+ member["name"] = self.resolver.name_on_date(
+ member["person_id"], self.date
+ )
+ if member_tag.get("ContributionType"):
+ member["type"] = member_tag.get("ContributionType")
return member
else:
raise ContextException(
- 'No match for MnisId {0}\n'.format(mnis_id),
- stamp=tag.get('url'),
- fragment=member_tag.text
+ "No match for MnisId {0}\n".format(mnis_id),
+ stamp=tag.get("url"),
+ fragment=member_tag.text,
)
return None
def parse_date(self, date):
text = self.get_single_line_text_from_element(date)
- time_parts = re.match('\s*the\s+house (?:being |having )?met at?\s+(.*?)$(?i)', text)
+ time_parts = re.match(
+ "\s*the\s+house (?:being |having )?met at?\s+(.*?)$(?i)", text
+ )
if time_parts:
time = time_parts.group(1)
- time = re.sub('</?i>',' ', time)
- time = re.sub('\s+',' ', time)
+ time = re.sub("</?i>", " ", time)
+ time = re.sub("\s+", " ", time)
if re.match("half-past Nine(?i)", time):
- newtime = '09:30:00'
+ newtime = "09:30:00"
elif re.match("a quarter to Ten o(?i)", time):
- newtime = '09:45:00'
+ newtime = "09:45:00"
elif re.match("Ten o.clock(?i)", time):
- newtime = '10:00:00'
+ newtime = "10:00:00"
elif re.match("half-past Ten(?i)", time):
- newtime = '10:30:00'
+ newtime = "10:30:00"
elif re.match("Eleven o'clock(?i)", time):
- newtime = '11:00:00'
+ newtime = "11:00:00"
elif re.match("twenty-five minutes past\s*Eleven(?i)", time):
- newtime = '11:25:00'
+ newtime = "11:25:00"
elif re.match("twenty-six minutes past\s*Eleven(?i)", time):
- newtime = '11:26:00'
+ newtime = "11:26:00"
elif re.match("twenty-nine minutes past\s*Eleven(?i)", time):
- newtime = '11:29:00'
+ newtime = "11:29:00"
elif re.match("half-past Eleven(?i)", time):
- newtime = '11:30:00'
+ newtime = "11:30:00"
elif re.match("(Twelve noon|Midday)(?i)", time):
- newtime = '12:00:00'
+ newtime = "12:00:00"
elif re.match("half-past Twelve(?i)", time):
- newtime = '12:30:00'
+ newtime = "12:30:00"
elif re.match("One o.clock(?i)", time):
- newtime = '13:00:00'
+ newtime = "13:00:00"
elif re.match("half-past One(?i)", time):
- newtime = '13:30:00'
+ newtime = "13:30:00"
elif re.match("Two o.clock(?i)", time):
- newtime = '14:00:00'
+ newtime = "14:00:00"
elif re.match("half-past Two(?i)", time):
- newtime = '14:30:00'
+ newtime = "14:30:00"
elif re.match("half-past Three(?i)", time):
- newtime = '15:30:00'
+ newtime = "15:30:00"
elif re.match("twenty minutes to Three(?i)", time):
- newtime = '14:40:00'
+ newtime = "14:40:00"
elif re.match("10 minutes past Three(?i)", time):
- newtime = '15:10:00'
+ newtime = "15:10:00"
elif re.match("Six o'clock(?i)", time):
- newtime = '18:00:00'
+ newtime = "18:00:00"
else:
- raise ContextException("No time matched", fragment=time)
+ raise ContextException("No time matched", fragment=time)
self.time = newtime
# this covers the "The Attorney General was Asked - " type
# bits at the start of Oral questions which are in an
# hs_6fDate tag.
- elif re.match('.*was asked.*', text):
+ elif re.match(".*was asked.*", text):
self.parse_para_with_member(date, None)
def parse_oral_heading(self, heading):
self.clear_current_speech()
self.output_heading = True
- tag = etree.Element('oral-heading')
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
- tag.set(
- 'url',
- self.get_major_url(heading.get('url'))
- )
+ tag = etree.Element("oral-heading")
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
+ tag.set("url", self.get_major_url(heading.get("url")))
tag.text = heading.text
self.root.append(tag)
@@ -432,29 +429,26 @@ def parse_debateheading(self, tag):
def parse_major(self, heading, **kwargs):
text = self.get_text_from_element(heading)
- if text.strip() == 'Prayers':
+ if text.strip() == "Prayers":
return
self.clear_current_speech()
- tag = etree.Element('major-heading')
+ tag = etree.Element("major-heading")
if self.uc_titles:
tag.text = text.upper()
else:
tag.text = text
- if 'extra_text' in kwargs:
- tag.text = '{0} - '.format(tag.text)
- i = etree.Element('i')
- i.text = kwargs['extra_text']
+ if "extra_text" in kwargs:
+ tag.text = "{0} - ".format(tag.text)
+ i = etree.Element("i")
+ i.text = kwargs["extra_text"]
tag.append(i)
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
- tag.set(
- 'url',
- self.get_major_url(heading.get('url'))
- )
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
+ tag.set("url", self.get_major_url(heading.get("url")))
self.root.append(tag)
self.output_heading = True
@@ -469,26 +463,27 @@ def parse_chair(self, heading):
self.initial_chair = self.get_text_from_element(heading)
def parse_minor(self, heading):
-
next_elt = heading.getnext()
- if next_elt is not None and self.get_tag_name_no_ns(next_elt) in self.minor_headings:
- text = ' - '.join([
- self.get_single_line_text_from_element(heading),
- self.get_single_line_text_from_element(next_elt)
- ])
+ if (
+ next_elt is not None
+ and self.get_tag_name_no_ns(next_elt) in self.minor_headings
+ ):
+ text = " - ".join(
+ [
+ self.get_single_line_text_from_element(heading),
+ self.get_single_line_text_from_element(next_elt),
+ ]
+ )
heading.text = text
self.skip_tag = self.get_tag_name_no_ns(next_elt)
self.clear_current_speech()
- tag = etree.Element('minor-heading')
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
- tag.set(
- 'url',
- self.get_minor_url(heading.get('url'))
- )
+ tag = etree.Element("minor-heading")
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
+ tag.set("url", self.get_minor_url(heading.get("url")))
text = self.get_text_from_element(heading)
tag.text = text
self.root.append(tag)
@@ -521,15 +516,17 @@ def parse_opposition(self, heading):
minor heading
"""
following = heading.xpath(
- '(./following-sibling::ns:hs_2cDebatedMotion|./following-sibling::ns:hs_7SmCapsHdg|./following-sibling::ns:hs_2GenericHdg)',
- namespaces=self.ns_map
+ "(./following-sibling::ns:hs_2cDebatedMotion|./following-sibling::ns:hs_7SmCapsHdg|./following-sibling::ns:hs_2GenericHdg)",
+ namespaces=self.ns_map,
)
- text = ''
+ text = ""
if len(following) == 1:
- text = ' - '.join([
- self.get_single_line_text_from_element(heading),
- self.get_single_line_text_from_element(following[0])
- ])
+ text = " - ".join(
+ [
+ self.get_single_line_text_from_element(heading),
+ self.get_single_line_text_from_element(following[0]),
+ ]
+ )
heading.text = text
self.skip_tag = self.get_tag_name_no_ns(following[0])
@@ -542,15 +539,16 @@ def parse_debated_motion(self, motion):
when we see the hs_2DebatedMotion tag
"""
following = motion.xpath(
- './following-sibling::ns:hs_6bFormalmotion',
- namespaces=self.ns_map
+ "./following-sibling::ns:hs_6bFormalmotion", namespaces=self.ns_map
)
- text = ''
+ text = ""
if len(following) == 1:
- text = ' - '.join([
- self.get_single_line_text_from_element(motion),
- self.get_single_line_text_from_element(following[0])
- ])
+ text = " - ".join(
+ [
+ self.get_single_line_text_from_element(motion),
+ self.get_single_line_text_from_element(following[0]),
+ ]
+ )
motion.text = text
self.skip_tag = self.get_tag_name_no_ns(following[0])
@@ -571,54 +569,47 @@ def parse_WHDebate(self, debate):
minor heading
"""
chair = debate.xpath(
- '(./preceding-sibling::ns:hs_76fChair | ./following-sibling::ns:hs_76fChair)',
- namespaces=self.ns_map
+ "(./preceding-sibling::ns:hs_76fChair | ./following-sibling::ns:hs_76fChair)",
+ namespaces=self.ns_map,
)
if len(chair) == 1:
chair_text = self.get_single_line_text_from_element(chair[0])
- text = '\n{0} — {1}\n'.format(text, chair_text)
+ text = "\n{0} — {1}\n".format(text, chair_text)
self.clear_current_speech()
- tag = etree.Element('minor-heading')
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
- tag.set(
- 'url',
- self.get_major_url(debate.get('url'))
- )
+ tag = etree.Element("minor-heading")
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
+ tag.set("url", self.get_major_url(debate.get("url")))
tag.text = text
self.root.append(tag)
self.output_heading = True
def parse_question(self, question):
- member = question.xpath('.//ns:Member', namespaces=self.ns_map)[0]
+ member = question.xpath(".//ns:Member", namespaces=self.ns_map)[0]
member = self.parse_member(member)
- first_para = question.xpath('.//ns:hs_Para', namespaces=self.ns_map)[0]
- self.new_speech(member, first_para.get('url'))
+ first_para = question.xpath(".//ns:hs_Para", namespaces=self.ns_map)[0]
+ self.new_speech(member, first_para.get("url"))
- number = ''.join(
- question.xpath('.//ns:Number/text()', namespaces=self.ns_map)
- )
- if number != '':
- self.current_speech.set('oral-qnum', number)
+ number = "".join(question.xpath(".//ns:Number/text()", namespaces=self.ns_map))
+ if number != "":
+ self.current_speech.set("oral-qnum", number)
- p = etree.Element('p')
- p.set('pid', self.get_pid())
- uin = question.xpath('.//ns:Uin', namespaces=self.ns_map)
+ p = etree.Element("p")
+ p.set("pid", self.get_pid())
+ uin = question.xpath(".//ns:Uin", namespaces=self.ns_map)
if len(uin) > 0:
- uin_text = ''.join(uin[0].xpath('.//text()'))
- m = re.match('\[\s*(\d+)\s*\]', uin_text)
+ uin_text = "".join(uin[0].xpath(".//text()"))
+ m = re.match("\[\s*(\d+)\s*\]", uin_text)
if m is not None:
no = m.groups(1)[0]
- p.set('qnum', no)
+ p.set("qnum", no)
- text = first_para.xpath(
- './/ns:QuestionText/text()', namespaces=self.ns_map
- )
- text = ''.join(text)
+ text = first_para.xpath(".//ns:QuestionText/text()", namespaces=self.ns_map)
+ text = "".join(text)
"""
sometimes the question text is after the <QuestionText> tag rather
than inside it in which case we want to grab all the
@@ -634,50 +625,48 @@ def parse_question(self, question):
the Secretary of State aware that the Construction Industry (etc)
"""
- if text == '':
+ if text == "":
q_text = first_para.xpath(
- './/ns:QuestionText/following-sibling::text()',
- namespaces=self.ns_map
+ ".//ns:QuestionText/following-sibling::text()", namespaces=self.ns_map
)
if len(q_text):
- text = ''.join(q_text)
+ text = "".join(q_text)
- p.text = re.sub('\n', ' ', text)
+ p.text = re.sub("\n", " ", text)
self.current_speech.append(p)
# and sometimes there is more question text in following siblings
# so we need to handle those too
following_tags = first_para.xpath(
- './following-sibling::*',
- namespaces=self.ns_map
+ "./following-sibling::*", namespaces=self.ns_map
)
for t in following_tags:
tag_name = self.get_tag_name_no_ns(t)
self.handle_tag(tag_name, t)
def parse_indent(self, tag):
- self.parse_para_with_member(tag, None, css_class='indent')
+ self.parse_para_with_member(tag, None, css_class="indent")
def parse_petition(self, petition):
- petition.text = 'Petition - {0}'.format(petition.text)
+ petition.text = "Petition - {0}".format(petition.text)
self.parse_major(petition)
def output_normally_ignored(self):
self.clear_current_speech()
- tag = etree.Element('major-heading')
- tag.text = 'Prayers'
+ tag = etree.Element("major-heading")
+ tag.text = "Prayers"
- if hasattr(self, 'initial_chair'):
- tag.text += ' - '
- i = etree.Element('i')
+ if hasattr(self, "initial_chair"):
+ tag.text += " - "
+ i = etree.Element("i")
i.text = self.initial_chair
tag.append(i)
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
self.root.append(tag)
self.output_heading = True
@@ -685,29 +674,31 @@ def parse_para_with_member(self, para, member, **kwargs):
if not self.output_heading:
self.output_normally_ignored()
- members = para.xpath('.//ns:Member', namespaces=self.ns_map)
+ members = para.xpath(".//ns:Member", namespaces=self.ns_map)
if member is not None:
- self.new_speech(member, para.get('url'))
+ self.new_speech(member, para.get("url"))
elif members:
m_name = None
- bs = members[0].xpath('./ns:B', namespaces=self.ns_map)
+ bs = members[0].xpath("./ns:B", namespaces=self.ns_map)
if len(bs) == 1:
- m_name = {'name': re.sub('\s+', ' ', bs[0].text).strip()}
+ m_name = {"name": re.sub("\s+", " ", bs[0].text).strip()}
elif len(bs) == 0:
- m_name = {'name': re.sub('\s+', ' ', members[0].text).strip()}
- self.new_speech(m_name, para.get('url'))
+ m_name = {"name": re.sub("\s+", " ", members[0].text).strip()}
+ self.new_speech(m_name, para.get("url"))
elif self.current_speech is None:
- self.new_speech(None, para.get('url'))
+ self.new_speech(None, para.get("url"))
# this makes the text fetching a bit easier
- if kwargs.get('strip_member', True):
+ if kwargs.get("strip_member", True):
for m in members:
- italics = m.xpath('.//ns:I', namespaces=self.ns_map)
- text = ''.join(self.get_single_line_text_from_element(i) for i in italics)
+ italics = m.xpath(".//ns:I", namespaces=self.ns_map)
+ text = "".join(
+ self.get_single_line_text_from_element(i) for i in italics
+ )
if text:
- kwargs['css_class'] = 'italic'
+ kwargs["css_class"] = "italic"
if m.tail:
- text += ' ' + m.tail
+ text += " " + m.tail
m.getparent().text = text
m.getparent().remove(m)
@@ -715,21 +706,21 @@ def parse_para_with_member(self, para, member, **kwargs):
if len(text) == 0:
return
- i = para.xpath('./ns:I', namespaces=self.ns_map)
+ i = para.xpath("./ns:I", namespaces=self.ns_map)
if len(i) == 1:
i_text = self.get_single_line_text_from_element(i[0])
if text == i_text:
- kwargs['css_class'] = 'italic'
+ kwargs["css_class"] = "italic"
- fs = '{0} '.format(PhraseTokenize(self.date, text).GetPara())
+ fs = "{0} ".format(PhraseTokenize(self.date, text).GetPara())
tag = etree.fromstring(fs)
if self.use_pids:
- tag.set('pid', self.get_pid())
- if 'css_class' in kwargs:
- tag.set('class', kwargs['css_class'])
- if 'pwmotiontext' in kwargs:
- tag.set('pwmotiontext', kwargs['pwmotiontext'])
+ tag.set("pid", self.get_pid())
+ if "css_class" in kwargs:
+ tag.set("class", kwargs["css_class"])
+ if "pwmotiontext" in kwargs:
+ tag.set("pwmotiontext", kwargs["pwmotiontext"])
self.current_speech.append(tag)
@@ -741,7 +732,7 @@ def parse_para(self, para):
member = None
for tag in para:
tag_name = self.get_tag_name_no_ns(tag)
- if tag_name == 'B' or tag_name == 'Member':
+ if tag_name == "B" or tag_name == "Member":
m = self.parse_member(tag)
if m:
member = m
@@ -749,45 +740,48 @@ def parse_para(self, para):
self.parse_para_with_member(para, member)
def parse_brev(self, brev):
- self.parse_para_with_member(brev, None, css_class="indent", pwmotiontext='yes')
+ self.parse_para_with_member(brev, None, css_class="indent", pwmotiontext="yes")
def parse_votelist(self, votes, direction, vote_list, is_teller=False):
for vote in votes:
- tag = etree.Element('mpname')
+ tag = etree.Element("mpname")
member = self.parse_member(vote)
- tag.set('person_id', member['person_id'])
- tag.set('vote', direction)
+ tag.set("person_id", member["person_id"])
+ tag.set("vote", direction)
if is_teller:
- tag.set('teller', 'yes')
- if self.debate_type == 'standing':
- tag.set('membername', member['name'])
- tag.text = member['name']
+ tag.set("teller", "yes")
+ if self.debate_type == "standing":
+ tag.set("membername", member["name"])
+ tag.text = member["name"]
proxy = None
vote_text = self.get_single_line_text_from_element(vote)
- m = re.search('\(Proxy vote cast by (.*)\)', vote_text)
+ m = re.search("\(Proxy vote cast by (.*)\)", vote_text)
if m:
proxy = self.resolver.pbc_match(m.group(1), self.date)
if proxy:
- tag.set('proxy', proxy['id'])
+ tag.set("proxy", proxy["id"])
vote_list.append(tag)
return vote_list
def parse_table(self, wrapper):
- rows = wrapper.xpath('.//ns:row', namespaces=self.ns_map)
- tag = etree.Element('table')
- body = etree.Element('tbody')
+ rows = wrapper.xpath(".//ns:row", namespaces=self.ns_map)
+ tag = etree.Element("table")
+ body = etree.Element("tbody")
url = None
for row in rows:
- row_tag = etree.Element('tr')
- row_tag.set('pid', self.get_pid())
+ row_tag = etree.Element("tr")
+ row_tag.set("pid", self.get_pid())
- for entry in row.xpath('(.//ns:hs_TableHeading|.//ns:hs_brev|.//ns:hs_Para|.//ns:hs_para)', namespaces=self.ns_map):
+ for entry in row.xpath(
+ "(.//ns:hs_TableHeading|.//ns:hs_brev|.//ns:hs_Para|.//ns:hs_para)",
+ namespaces=self.ns_map,
+ ):
if url is None:
- url = entry.get('url')
- td_tag = etree.Element('td')
+ url = entry.get("url")
+ td_tag = etree.Element("td")
td_tag.text = self.get_single_line_text_from_element(entry)
row_tag.append(td_tag)
@@ -802,19 +796,21 @@ def parse_table(self, wrapper):
self.current_speech.append(tag)
def get_division_tag(self, division, yes_text, no_text):
- tag = etree.Element('division')
-
- tag.set('id', self.get_speech_id())
- tag.set('nospeaker', 'true')
- tag.set('divdate', self.date)
- div_number = division.xpath('.//ns:' + self.division_number_element, namespaces=self.ns_map)[0]
+ tag = etree.Element("division")
+
+ tag.set("id", self.get_speech_id())
+ tag.set("nospeaker", "true")
+ tag.set("divdate", self.date)
+ div_number = division.xpath(
+ ".//ns:" + self.division_number_element, namespaces=self.ns_map
+ )[0]
div_number = self.get_single_line_text_from_element(div_number)
- tag.set('divnumber', div_number)
- tag.set('colnum', self.current_col)
- tag.set('time', self.current_time)
+ tag.set("divnumber", div_number)
+ tag.set("colnum", self.current_col)
+ tag.set("time", self.current_time)
- div_count = etree.Element('divisioncount')
+ div_count = etree.Element("divisioncount")
div_count.set(self.division_ayes_attribute, yes_text)
div_count.set(self.division_noes_attribute, no_text)
@@ -827,102 +823,111 @@ def parse_division(self, division):
if type(tag) is etree._ProcessingInstruction:
continue
tag_name = self.get_tag_name_no_ns(tag)
- if tag_name not in ('hs_Para', 'England', 'EnglandWales', 'hs_DivListHeader', 'TwoColumn'):
+ if tag_name not in (
+ "hs_Para",
+ "England",
+ "EnglandWales",
+ "hs_DivListHeader",
+ "TwoColumn",
+ ):
if not self.handle_tag(tag_name, tag):
- raise ContextException('unhandled tag: {0}'.format(tag_name), fragment=tag, stamp=tag.get('url'))
+ raise ContextException(
+ "unhandled tag: {0}".format(tag_name),
+ fragment=tag,
+ stamp=tag.get("url"),
+ )
- ayes_count = \
- division.xpath('./ns:hs_Para/ns:AyesNumber/text()', namespaces=self.ns_map)
- noes_count = \
- division.xpath('./ns:hs_Para/ns:NoesNumber/text()', namespaces=self.ns_map)
+ ayes_count = division.xpath(
+ "./ns:hs_Para/ns:AyesNumber/text()", namespaces=self.ns_map
+ )
+ noes_count = division.xpath(
+ "./ns:hs_Para/ns:NoesNumber/text()", namespaces=self.ns_map
+ )
- ayes_count_text = ''.join(ayes_count)
- noes_count_text = ''.join(noes_count)
+ ayes_count_text = "".join(ayes_count)
+ noes_count_text = "".join(noes_count)
self.clear_current_speech()
tag = self.get_division_tag(division, ayes_count_text, noes_count_text)
- ayes = division.xpath(
- './/ns:NamesAyes//ns:Member', namespaces=self.ns_map
- )
- noes = division.xpath(
- './/ns:NamesNoes//ns:Member', namespaces=self.ns_map
- )
+ ayes = division.xpath(".//ns:NamesAyes//ns:Member", namespaces=self.ns_map)
+ noes = division.xpath(".//ns:NamesNoes//ns:Member", namespaces=self.ns_map)
aye_tellers = division.xpath(
- './/ns:TellerNamesAyes//ns:Member', namespaces=self.ns_map
+ ".//ns:TellerNamesAyes//ns:Member", namespaces=self.ns_map
)
noe_tellers = division.xpath(
- './/ns:TellerNamesNoes//ns:Member', namespaces=self.ns_map
+ ".//ns:TellerNamesNoes//ns:Member", namespaces=self.ns_map
)
- aye_list = etree.Element('mplist')
- aye_list.set('vote', 'aye')
- aye_list = self.parse_votelist(ayes, 'aye', aye_list)
- aye_list = self.parse_votelist(aye_tellers, 'aye', aye_list, True)
+ aye_list = etree.Element("mplist")
+ aye_list.set("vote", "aye")
+ aye_list = self.parse_votelist(ayes, "aye", aye_list)
+ aye_list = self.parse_votelist(aye_tellers, "aye", aye_list, True)
tag.append(aye_list)
- noe_list = etree.Element('mplist')
- noe_list.set('vote', 'no')
- noe_list = self.parse_votelist(noes, 'no', noe_list)
- noe_list = self.parse_votelist(noe_tellers, 'no', noe_list, True)
+ noe_list = etree.Element("mplist")
+ noe_list.set("vote", "no")
+ noe_list = self.parse_votelist(noes, "no", noe_list)
+ noe_list = self.parse_votelist(noe_tellers, "no", noe_list, True)
tag.append(noe_list)
self.root.append(tag)
- #Â England/EnglandWales not used since May 2018
- paras = division.xpath('(./ns:hs_Para|./ns:England/ns:hs_Para|./ns:EnglandWales/ns:hs_Para)', namespaces=self.ns_map)
+ # England/EnglandWales not used since May 2018
+ paras = division.xpath(
+ "(./ns:hs_Para|./ns:England/ns:hs_Para|./ns:EnglandWales/ns:hs_Para)",
+ namespaces=self.ns_map,
+ )
for para in paras:
text = self.get_single_line_text_from_element(para)
- if re.search(r'Division\s*No', text):
+ if re.search(r"Division\s*No", text):
continue
self.parse_para(para)
def parse_time(self, tag):
- time_txt = ''.join(tag.xpath('.//text()'))
- if time_txt == '':
+ time_txt = "".join(tag.xpath(".//text()"))
+ if time_txt == "":
return
- matches = re.match('(\d+)(?:[:.,]\s*(\d+))?[\xa0\s]*(am|pm)', time_txt)
+ matches = re.match("(\d+)(?:[:.,]\s*(\d+))?[\xa0\s]*(am|pm)", time_txt)
if matches:
hours = int(matches.group(1))
minutes = int(matches.group(2) or 0)
- if matches.group(3) == 'pm' and hours < 12:
+ if matches.group(3) == "pm" and hours < 12:
hours += 12
time = datetime.time(hours, minutes)
self.current_time = time.isoformat()
- elif time_txt in ('Noon', 'noon') or re.match('12\s*?noon', time_txt):
+ elif time_txt in ("Noon", "noon") or re.match("12\s*?noon", time_txt):
self.current_time = "12:00:00"
- elif re.match('12\s*?midnight', time_txt):
+ elif re.match("12\s*?midnight", time_txt):
self.current_time = "00:00:00"
- elif re.match('Midnight', time_txt):
+ elif re.match("Midnight", time_txt):
self.current_time = "00:00:00"
else:
raise ContextException(
- "Unmatched time %s" % time_txt,
- fragment=tag,
- stamp=tag.get('url')
+ "Unmatched time %s" % time_txt, fragment=tag, stamp=tag.get("url")
)
def parse_procedure(self, procedure):
- tag = etree.Element('p')
+ tag = etree.Element("p")
text = self.get_single_line_text_from_element(procedure)
if len(text) == 0:
return
# We ignore prayers
- if re.match('Prayers.*?read by', text):
+ if re.match("Prayers.*?read by", text):
return
if not self.output_heading:
self.output_normally_ignored()
- tag.set('pid', self.get_pid())
- tag.set('class', 'italic')
+ tag.set("pid", self.get_pid())
+ tag.set("class", "italic")
tag.text = text
if self.current_speech is None:
- self.new_speech(None, procedure.get('url'))
+ self.new_speech(None, procedure.get("url"))
self.current_speech.append(tag)
@@ -930,7 +935,7 @@ def parse_pi(self, pi):
# you would think there is a better way to do this but I can't seem
# to extract attributes from processing instructions :(
text = str(pi)
- matches = re.search(r'column=(\d+)\?', text)
+ matches = re.search(r"column=(\d+)\?", text)
if matches is not None:
col = matches.group(1)
self.current_col = col
@@ -941,20 +946,20 @@ def handle_tag(self, tag_name, tag):
if self.skip_tag is not None and tag_name == self.skip_tag:
self.skip_tag = None
- elif tag_name == 'hs_6fDate':
+ elif tag_name == "hs_6fDate":
self.parse_date(tag)
elif tag_name in self.oral_headings:
self.parse_oral_heading(tag)
- elif tag_name == 'hs_3cOppositionDay':
+ elif tag_name == "hs_3cOppositionDay":
self.parse_opposition(tag)
- elif tag_name == 'hs_2DebatedMotion':
+ elif tag_name == "hs_2DebatedMotion":
self.parse_debated_motion(tag)
- elif tag_name == 'DebateHeading':
+ elif tag_name == "DebateHeading":
handled = self.parse_debateheading(tag)
- elif tag_name == 'hs_2DebBill':
- if self.debate_type == 'westminhall':
+ elif tag_name == "hs_2DebBill":
+ if self.debate_type == "westminhall":
self.parse_WHDebate(tag)
- elif self.debate_type == 'debate':
+ elif self.debate_type == "debate":
self.parse_major(tag)
elif tag_name in self.major_headings:
self.parse_major(tag)
@@ -966,21 +971,21 @@ def handle_tag(self, tag_name, tag):
self.parse_generic(tag)
elif tag_name in self.whall_headings:
self.parse_WHDebate(tag)
- elif tag_name == 'Question':
+ elif tag_name == "Question":
self.parse_question(tag)
- elif tag_name == 'hs_8Petition':
+ elif tag_name == "hs_8Petition":
self.parse_petition(tag)
elif tag_name in self.indents:
self.parse_indent(tag)
elif tag_name in self.paras:
self.parse_para(tag)
- elif tag_name == 'hs_brev' or tag_name == 'hs_brevIndent':
+ elif tag_name == "hs_brev" or tag_name == "hs_brevIndent":
self.parse_brev(tag)
- elif tag_name == 'TableWrapper':
+ elif tag_name == "TableWrapper":
self.parse_table(tag)
- elif tag_name == 'Division':
+ elif tag_name == "Division":
self.parse_division(tag)
- elif tag_name == 'hs_Timeline':
+ elif tag_name == "hs_Timeline":
self.parse_time(tag)
elif tag_name in self.ignored_tags:
pass
@@ -996,21 +1001,20 @@ def parse_day(self, xml_file):
ok = self.setup_parser(xml_file)
if not ok:
return False
- self.root.set('scraperversion', self.rev)
- self.root.set('latest', 'yes')
- self.current_col = self.input_root[0].get('ColStart')
+ self.root.set("scraperversion", self.rev)
+ self.root.set("latest", "yes")
+ self.current_col = self.input_root[0].get("ColStart")
headers = self.input_root[0].xpath(
- './/ns:Fragment/ns:Header', namespaces=self.ns_map
+ ".//ns:Fragment/ns:Header", namespaces=self.ns_map
)
self.parse_system_header(headers[0])
body_tags = self.input_root[0].xpath(
- './/ns:Fragment/ns:Body', namespaces=self.ns_map
+ ".//ns:Fragment/ns:Body", namespaces=self.ns_map
)
for b in body_tags:
for tag in b:
-
# column numbers are contained in processing
# instructions so first check if the tag is
# one of those because then we don't need to
@@ -1025,13 +1029,13 @@ def parse_day(self, xml_file):
tag_name = self.get_tag_name_no_ns(tag)
if self.verbose >= 2:
- start_tag = re.sub('>.*', '>', etree.tounicode(tag))
- print('Parsing %s' % start_tag)
+ start_tag = re.sub(">.*", ">", etree.tounicode(tag))
+ print("Parsing %s" % start_tag)
if not self.handle_tag(tag_name, tag):
raise ContextException(
- 'unhandled tag: {0}'.format(tag_name),
+ "unhandled tag: {0}".format(tag_name),
fragment=etree.tostring(tag),
- stamp=tag.get('url')
+ stamp=tag.get("url"),
)
# PI handling - check inside all tags for processing
@@ -1049,7 +1053,7 @@ def get_date(self, xml_file):
return False
headers = self.input_root[0].xpath(
- './/ns:Fragment/ns:Header', namespaces=self.ns_map
+ ".//ns:Fragment/ns:Header", namespaces=self.ns_map
)
self.parse_system_header(headers[0])
return self.date
@@ -1061,20 +1065,19 @@ def setup_parser(self, xml_file):
if self.input_root is not None:
return True
- self.root = etree.Element('publicwhip')
+ self.root = etree.Element("publicwhip")
self.ns = self.type_to_xpath[self.debate_type][1]
- self.ns_map = {'ns': self.ns}
+ self.ns_map = {"ns": self.ns}
root_xpath = self.type_to_xpath[self.debate_type][0]
self.xml_root = self.get_parser(xml_file).getroot()
- self.input_root = self.xml_root.xpath(
- root_xpath, namespaces=self.ns_map
- )
+ self.input_root = self.xml_root.xpath(root_xpath, namespaces=self.ns_map)
if len(self.input_root) == 0:
if self.verbose >= 1:
sys.stderr.write(
- 'Failed to find any debates of type {0} in {1}\n'
- .format(self.debate_type, xml_file.name)
+ "Failed to find any debates of type {0} in {1}\n".format(
+ self.debate_type, xml_file.name
+ )
)
return False
return True
@@ -1090,10 +1093,10 @@ class PBCParseDayXML(BaseParseDayXML):
use_pids = False
ignored_tags = [
- 'hs_CLHeading',
- 'hs_CLAttended',
- 'hs_6fCntrItalHdg',
- 'hs_TimeCode',
+ "hs_CLHeading",
+ "hs_CLAttended",
+ "hs_6fCntrItalHdg",
+ "hs_TimeCode",
]
def reset(self):
@@ -1109,11 +1112,11 @@ def get_speech_id_first_part(self):
def get_member_with_no_id(self, member_tag):
name = member_tag.text
if not name:
- bs = member_tag.xpath('./ns:B', namespaces=self.ns_map)
+ bs = member_tag.xpath("./ns:B", namespaces=self.ns_map)
if bs:
name = bs[0].text
- name = name.rstrip(':')
+ name = name.rstrip(":")
member = self.resolver.pbc_match(name, self.date)
return member
@@ -1124,70 +1127,70 @@ def get_member_with_no_id(self, member_tag):
# we want the immediately preceding one which will be the last one
# in the array
def get_attending_status(self, member_tag):
- text = member_tag.xpath('./preceding-sibling::text()')
- if len(text) > 0 and re.search('\u2020', text[-1]):
- return 'true'
+ text = member_tag.xpath("./preceding-sibling::text()")
+ if len(text) > 0 and re.search("\u2020", text[-1]):
+ return "true"
- return 'false'
+ return "false"
def parse_chairmen(self, chair):
- member_tags = chair.xpath('.//ns:Member', namespaces=self.ns_map)
+ member_tags = chair.xpath(".//ns:Member", namespaces=self.ns_map)
for member_tag in member_tags:
member = self.parse_member(member_tag)
if member is None:
member = self.get_member_with_no_id(member_tag)
if member is not None:
- member['attending'] = self.get_attending_status(member_tag)
+ member["attending"] = self.get_attending_status(member_tag)
self.chairs.append(member)
else:
raise ContextException(
- 'No match for PBC chairman {0}'.format(member_tag.text),
- stamp=member_tag.get('url'),
- fragment=member_tag.text
+ "No match for PBC chairman {0}".format(member_tag.text),
+ stamp=member_tag.get("url"),
+ fragment=member_tag.text,
)
def parse_clmember(self, clmember):
- member_tag = clmember.xpath('.//ns:Member', namespaces=self.ns_map)[0]
+ member_tag = clmember.xpath(".//ns:Member", namespaces=self.ns_map)[0]
member = self.parse_member(member_tag)
if member is None:
member = self.get_member_with_no_id(member_tag)
- cons_tags = member_tag.xpath('.//ns:I', namespaces=self.ns_map)
- cons = ''
+ cons_tags = member_tag.xpath(".//ns:I", namespaces=self.ns_map)
+ cons = ""
if len(cons_tags) == 1:
cons_tag = cons_tags[0]
cons = cons_tag.text
- cons = re.sub(r'[()]', '', cons)
+ cons = re.sub(r"[()]", "", cons)
if member is not None:
- member['attending'] = self.get_attending_status(member_tag)
- member['pbc_cons'] = cons
+ member["attending"] = self.get_attending_status(member_tag)
+ member["pbc_cons"] = cons
self.members.append(member)
else:
raise ContextException(
- 'No match for PBC member {0}'.format(member_tag.text),
- stamp=member_tag.get('url'),
- fragment=member_tag.text
+ "No match for PBC member {0}".format(member_tag.text),
+ stamp=member_tag.get("url"),
+ fragment=member_tag.text,
)
def parse_clerks(self, clerks):
text = clerks.text
- self.clerks = text.split(',')
+ self.clerks = text.split(",")
def parse_witness(self, witness):
self.witnesses.append(witness.text)
def committee_finished(self):
- committee = etree.Element('committee')
+ committee = etree.Element("committee")
- chairmen = etree.Element('chairmen')
+ chairmen = etree.Element("chairmen")
for c in self.chairs:
- mp = etree.Element('mpname')
- mp.set('person_id', c['person_id'])
- mp.set('membername', c['name'])
- mp.set('attending', c['attending'])
- mp.text = c['name']
+ mp = etree.Element("mpname")
+ mp.set("person_id", c["person_id"])
+ mp.set("membername", c["name"])
+ mp.set("attending", c["attending"])
+ mp.text = c["name"]
chairmen.append(mp)
committee.append(chairmen)
@@ -1195,39 +1198,41 @@ def committee_finished(self):
def current_membership(pid):
members = self.resolver.persontomembermap[pid]
members = [self.resolver.members[mid] for mid in members]
- members = [m for m in members if m['start_date'] <= self.date <= m['end_date']]
+ members = [
+ m for m in members if m["start_date"] <= self.date <= m["end_date"]
+ ]
assert len(members) == 1
return members[0]
for m in self.members:
- mp = etree.Element('mpname')
- mp.set('person_id', m['person_id'])
- mp.set('membername', m['name'])
- mp.set('attending', m['attending'])
- mp.text = m['name']
- cons = etree.Element('i')
+ mp = etree.Element("mpname")
+ mp.set("person_id", m["person_id"])
+ mp.set("membername", m["name"])
+ mp.set("attending", m["attending"])
+ mp.text = m["name"]
+ cons = etree.Element("i")
# if it's a different cons then it's probably a position
# so use that instead and skip the party
- curr_member = current_membership(m['person_id'])
- if curr_member['constituency'] != m['pbc_cons']:
- cons.text = '({0})'.format(m['pbc_cons'])
+ curr_member = current_membership(m["person_id"])
+ if curr_member["constituency"] != m["pbc_cons"]:
+ cons.text = "({0})".format(m["pbc_cons"])
else:
- cons.text = '({0})'.format(curr_member['constituency'])
- cons.tail = '({0})'.format(curr_member['party'])
+ cons.text = "({0})".format(curr_member["constituency"])
+ cons.tail = "({0})".format(curr_member["party"])
mp.append(cons)
committee.append(mp)
for c in self.clerks:
- clerk = etree.Element('clerk')
+ clerk = etree.Element("clerk")
clerk.text = c
committee.append(clerk)
self.root.append(committee)
- witnesses = etree.Element('witnesses')
+ witnesses = etree.Element("witnesses")
for w in self.witnesses:
- witness = etree.Element('witness')
+ witness = etree.Element("witness")
witness.text = w
witnesses.append(witness)
@@ -1236,15 +1241,15 @@ def current_membership(pid):
def parse_bill_title(self, title_tag):
title = self.get_single_line_text_from_element(title_tag)
- bill = etree.Element('bill')
- bill.set('title', title)
- bill.set('session', self.session)
+ bill = etree.Element("bill")
+ bill.set("title", title)
+ bill.set("session", self.session)
bill.text = title
self.root.insert(0, bill)
def handle_minus_member(self, member):
- if member.get('InTheChair') == 'True':
+ if member.get("InTheChair") == "True":
return self.current_chair
return self.get_member_with_no_id(member)
@@ -1252,57 +1257,52 @@ def handle_minus_member(self, member):
def parse_chair(self, chair):
text = self.get_text_from_element(chair)
- if text in ('\n(Morning)\n', '\n(Afternoon)\n'):
+ if text in ("\n(Morning)\n", "\n(Afternoon)\n"):
# Actually a date, not a chair, they gave the wrong tag
return self.parse_date(chair)
- self.new_speech(None, chair.get('url'))
- tag = etree.Element('p')
+ self.new_speech(None, chair.get("url"))
+ tag = etree.Element("p")
tag.text = text
self.current_speech.append(tag)
- chair_match = re.match(
- r'\s*\[\s*(.*)\s+in\s+the\s+chair\s*\](?i)',
- text
- )
+ chair_match = re.match(r"\s*\[\s*(.*)\s+in\s+the\s+chair\s*\](?i)", text)
if chair_match is not None:
name = chair_match.groups(1)[0]
chair = self.resolver.pbc_match(name, self.date)
if chair is not None:
self.current_chair = chair
else:
- raise ContextException('No match for chair {0}'.format(text))
+ raise ContextException("No match for chair {0}".format(text))
def get_division_tag(self, division, yes_text, no_text):
- tag = etree.Element('divisioncount')
+ tag = etree.Element("divisioncount")
- div_number = \
- division.xpath('.//ns:Number/text()', namespaces=self.ns_map)
+ div_number = division.xpath(".//ns:Number/text()", namespaces=self.ns_map)
- tag.set('id', self.get_speech_id())
- tag.set('divnumber', ''.join(div_number))
- tag.set('ayes', yes_text)
- tag.set('noes', no_text)
- tag.set('url', '')
+ tag.set("id", self.get_speech_id())
+ tag.set("divnumber", "".join(div_number))
+ tag.set("ayes", yes_text)
+ tag.set("noes", no_text)
+ tag.set("url", "")
return tag
-
def parse_amendment(self, amendment, level):
- tag = etree.Element('p')
- tag.set('amendmenttext', 'true')
- tag.set('amendmentlevel', str(level))
+ tag = etree.Element("p")
+ tag.set("amendmenttext", "true")
+ tag.set("amendmentlevel", str(level))
tag.text = amendment.text
if self.current_speech is None:
- self.new_speech(None, amendment.get('url'))
+ self.new_speech(None, amendment.get("url"))
self.current_speech.append(tag)
def parse_table(self, table):
- paras = table.xpath('(.//ns:hs_Para|.//ns:hs_brev)', namespaces=self.ns_map)
+ paras = table.xpath("(.//ns:hs_Para|.//ns:hs_brev)", namespaces=self.ns_map)
for para in paras:
tag_name = self.get_tag_name_no_ns(para)
- if tag_name == 'hs_Para':
+ if tag_name == "hs_Para":
self.parse_para_with_member(para, None)
else:
self.parse_para_with_member(para, None, css_class="indent")
@@ -1315,20 +1315,20 @@ def parse_para(self, para):
has_witness = False
for tag in para.iter():
tag_name = self.get_tag_name_no_ns(tag)
- if tag_name == 'Witness':
+ if tag_name == "Witness":
has_witness = True
- name = self.get_single_line_text_from_element(tag).rstrip(':')
- self.new_speech({'name': name}, para.get('url'))
+ name = self.get_single_line_text_from_element(tag).rstrip(":")
+ self.new_speech({"name": name}, para.get("url"))
# Infer from italic text that it's a motiony thing and we should
# start a new para which is a bit fragile
- elif tag_name == 'I':
+ elif tag_name == "I":
has_i = True
if has_i and not has_witness:
- self.new_speech(None, para.get('url'))
+ self.new_speech(None, para.get("url"))
if has_witness:
- for w in para.xpath('.//ns:Witness', namespaces=self.ns_map):
+ for w in para.xpath(".//ns:Witness", namespaces=self.ns_map):
w.getparent().text = w.tail
w.getparent().remove(w)
self.parse_para_with_member(para, None)
@@ -1338,30 +1338,30 @@ def parse_para(self, para):
def handle_tag(self, tag_name, tag):
handled = True
- if tag_name == 'hs_CLMember':
+ if tag_name == "hs_CLMember":
self.parse_clmember(tag)
- elif tag_name == 'hs_CLClerks':
+ elif tag_name == "hs_CLClerks":
self.parse_clerks(tag)
- elif tag_name == 'hs_CLChairman':
+ elif tag_name == "hs_CLChairman":
self.parse_chairmen(tag)
- elif tag_name == 'hs_8GenericHdg':
+ elif tag_name == "hs_8GenericHdg":
self.parse_minor(tag)
- elif tag_name == 'hs_AmendmentLevel1':
+ elif tag_name == "hs_AmendmentLevel1":
self.parse_amendment(tag, 1)
- elif tag_name == 'hs_AmendmentLevel2':
+ elif tag_name == "hs_AmendmentLevel2":
self.parse_amendment(tag, 2)
- elif tag_name == 'TableWrapper':
+ elif tag_name == "TableWrapper":
self.parse_table(tag)
- elif tag_name == 'hs_CLPara':
+ elif tag_name == "hs_CLPara":
self.parse_witness(tag)
- elif tag_name == 'hs_brevIndent':
+ elif tag_name == "hs_brevIndent":
self.parse_brev(tag)
- elif tag_name in ('hs_2BillTitle', 'hs_2DebBill'):
+ elif tag_name in ("hs_2BillTitle", "hs_2DebBill"):
self.parse_bill_title(tag)
- elif tag_name == 'hs_3MainHdg':
+ elif tag_name == "hs_3MainHdg":
self.committee_finished()
self.parse_major(tag)
- elif tag_name == 'hs_ParaIndent':
+ elif tag_name == "hs_ParaIndent":
self.parse_para_with_member(tag, None, css_class="indent")
else:
handled = super(PBCParseDayXML, self).handle_tag(tag_name, tag)
@@ -1374,59 +1374,73 @@ def get_sitting(self, xml_file):
return False
# This isn't nice.
- fragment = self.input_root[0].xpath('.//ns:Fragment', namespaces=self.ns_map)[0]
- self.session, debate_num = re.search('Commons/(\d{4}_\d{4})/Committee_\d+/Debate_(\d+)/Sitting_\d+', fragment.get('__uri__')).groups()
- header = fragment.xpath('./ns:Header', namespaces=self.ns_map)[0]
+ fragment = self.input_root[0].xpath(".//ns:Fragment", namespaces=self.ns_map)[0]
+ self.session, debate_num = re.search(
+ "Commons/(\d{4}_\d{4})/Committee_\d+/Debate_(\d+)/Sitting_\d+",
+ fragment.get("__uri__"),
+ ).groups()
+ header = fragment.xpath("./ns:Header", namespaces=self.ns_map)[0]
try:
# The sitting number is only given in a random attribute
- data_id = header.xpath('./ns:SystemDataId', namespaces=self.ns_map)[0]
+ data_id = header.xpath("./ns:SystemDataId", namespaces=self.ns_map)[0]
data_id = self.get_single_line_text_from_element(data_id)
- sitting_num = int(re.match('P(?:BC|MB)\s*\d+-(\d+)', data_id).group(1))
+ sitting_num = int(re.match("P(?:BC|MB)\s*\d+-(\d+)", data_id).group(1))
except:
# Try and find one in the filename then.
- sitting_num = int(re.search('_(\d+)(?:st|nd|rd|th)_', xml_file.name).group(1))
+ sitting_num = int(
+ re.search("_(\d+)(?:st|nd|rd|th)_", xml_file.name).group(1)
+ )
try:
- title = header.xpath('./ns:Title', namespaces=self.ns_map)[0]
+ title = header.xpath("./ns:Title", namespaces=self.ns_map)[0]
title = self.get_single_line_text_from_element(title)
except:
- fragment = self.xml_root.xpath('.//ns:Fragment', namespaces=self.ns_map)[0]
- title = fragment.xpath('.//ns:Cover', namespaces=self.ns_map)[0].get('debate')
+ fragment = self.xml_root.xpath(".//ns:Fragment", namespaces=self.ns_map)[0]
+ title = fragment.xpath(".//ns:Cover", namespaces=self.ns_map)[0].get(
+ "debate"
+ )
- title = title.partition(' ')[0].upper()
+ title = title.partition(" ")[0].upper()
- self.session = re.sub('(\d{4})_\d\d(\d\d)', r'\1-\2', self.session)
+ self.session = re.sub("(\d{4})_\d\d(\d\d)", r"\1-\2", self.session)
# The 0 here is a part number. I do not know what the XML outputs for multiple parts
- self.sitting_id = "standing%s_%s_%02d-%d_%s" % (debate_num, title, sitting_num, 0, self.date)
+ self.sitting_id = "standing%s_%s_%02d-%d_%s" % (
+ debate_num,
+ title,
+ sitting_num,
+ 0,
+ self.date,
+ )
class LordsParseDayXML(BaseParseDayXML):
resolver = LordsList()
paras = [
- 'hs_para',
- 'hs_parafo',
- 'hs_Question',
- 'hs_newline10',
- 'hs_newline12',
- 'hs_HeadingTwo',
+ "hs_para",
+ "hs_parafo",
+ "hs_Question",
+ "hs_newline10",
+ "hs_newline12",
+ "hs_HeadingTwo",
]
ignored_tags = [
- 'hs_date',
- 'hs_Venue',
+ "hs_date",
+ "hs_Venue",
]
- division_number_element = 'DivisionNumber'
- division_ayes_attribute = 'content'
- division_noes_attribute = 'not-content'
+ division_number_element = "DivisionNumber"
+ division_ayes_attribute = "content"
+ division_noes_attribute = "not-content"
"""
Lords XML is scattered with processing instructions which upset
tag.text meaning it returns None which in turn breaks a lot of
our processing so just strip them all out.
"""
+
def get_parser(self, xml_file):
parser = etree.parse(xml_file)
pis = parser.xpath('//processing-instruction("xpp")')
@@ -1437,23 +1451,23 @@ def get_parser(self, xml_file):
return parser
def parse_quote(self, quote):
- tag = etree.Element('p')
- tag.set('pid', self.get_pid())
- tag.set('class', 'indent')
+ tag = etree.Element("p")
+ tag.set("pid", self.get_pid())
+ tag.set("class", "indent")
- tag.text = re.sub('\n', ' ', quote.text)
+ tag.text = re.sub("\n", " ", quote.text)
- i = quote.xpath('./ns:I', namespaces=self.ns_map)
+ i = quote.xpath("./ns:I", namespaces=self.ns_map)
if len(i) == 1:
i_text = self.get_single_line_text_from_element(i[0])
- new_i = etree.Element('i')
+ new_i = etree.Element("i")
new_i.text = i_text
- new_i.tail = re.sub('\n', ' ', i[0].tail or '')
- if re.match(r'Official Report,?$', i_text):
- phrase = etree.Element('phrase')
- phrase.set('class', 'offrep')
+ new_i.tail = re.sub("\n", " ", i[0].tail or "")
+ if re.match(r"Official Report,?$", i_text):
+ phrase = etree.Element("phrase")
+ phrase.set("class", "offrep")
# FIXME: generate a proper id here
- phrase.set('id', new_i.tail)
+ phrase.set("id", new_i.tail)
phrase.append(new_i)
tag.append(phrase)
else:
@@ -1463,15 +1477,12 @@ def parse_quote(self, quote):
def parse_member(self, member):
# special hand edited XML case :/
- name = member.get('ContinuationText')
- if name == 'The Queen':
- return {
- 'person_id': 'uk.org.publicwhip/person/13935',
- 'name': 'The Queen'
- }
+ name = member.get("ContinuationText")
+ if name == "The Queen":
+ return {"person_id": "uk.org.publicwhip/person/13935", "name": "The Queen"}
tag_name = self.get_tag_name_no_ns(member)
- if tag_name == 'B' and self.get_single_line_text_from_element(member) == '':
+ if tag_name == "B" and self.get_single_line_text_from_element(member) == "":
return None
found_member = super(LordsParseDayXML, self).parse_member(member)
@@ -1481,25 +1492,27 @@ def parse_member(self, member):
member_tag = self._parse_member_or_b(member)
if member_tag is None:
raise ContextException(
- 'Could not find member',
- stamp=member.get('url'),
- fragment=etree.tostring(member),
+ "Could not find member",
+ stamp=member.get("url"),
+ fragment=etree.tostring(member),
)
- if member_tag.get('MnisId') == '-1':
+ if member_tag.get("MnisId") == "-1":
found_member = {
- 'person_id': 'unknown',
- 'name': self.get_single_line_text_from_element(member).rstrip(':')
+ "person_id": "unknown",
+ "name": self.get_single_line_text_from_element(member).rstrip(":"),
}
return found_member
def parse_newdebate(self, tag):
- time = tag.xpath('.//ns:hs_time', namespaces=self.ns_map)
+ time = tag.xpath(".//ns:hs_time", namespaces=self.ns_map)
if len(time):
self.parse_time(time[0])
- heading = tag.xpath('.//ns:hs_DebateHeading|.//hs_AmendmentHeading', namespaces=self.ns_map)
- debate_type = tag.xpath('.//ns:hs_DebateType', namespaces=self.ns_map)
+ heading = tag.xpath(
+ ".//ns:hs_DebateHeading|.//hs_AmendmentHeading", namespaces=self.ns_map
+ )
+ debate_type = tag.xpath(".//ns:hs_DebateType", namespaces=self.ns_map)
if len(heading):
if len(debate_type):
text = self.get_single_line_text_from_element(debate_type[0])
@@ -1508,28 +1521,28 @@ def parse_newdebate(self, tag):
self.parse_major(heading[0])
else:
raise ContextException(
- 'New Lords debate with no heading',
- stamp=tag.get('url'),
- fragment=tag
- )
+ "New Lords debate with no heading", stamp=tag.get("url"), fragment=tag
+ )
return
- #procedure = tag.xpath('.//ns:hs_Procedure', namespaces=self.ns_map)
- #if len(procedure) == 1:
+ # procedure = tag.xpath('.//ns:hs_Procedure', namespaces=self.ns_map)
+ # if len(procedure) == 1:
# self.handle_para(procedure[0])
- want_member = tag.get('BusinessType') in ('Question', 'GeneralDebate')
+ want_member = tag.get("BusinessType") in ("Question", "GeneralDebate")
member = None
- member_tags = tag.xpath('.//ns:Member', namespaces=self.ns_map)
+ member_tags = tag.xpath(".//ns:Member", namespaces=self.ns_map)
if len(member_tags):
if want_member:
member = self.parse_member(member_tags[0])
else:
- tabledby_tags = tag.xpath('.//ns:hs_TabledBy', namespaces=self.ns_map)
- self.parse_para_with_member(tabledby_tags[0], None, css_class='italic', strip_member=False)
+ tabledby_tags = tag.xpath(".//ns:hs_TabledBy", namespaces=self.ns_map)
+ self.parse_para_with_member(
+ tabledby_tags[0], None, css_class="italic", strip_member=False
+ )
- questions = tag.xpath('.//ns:hs_Question', namespaces=self.ns_map)
+ questions = tag.xpath(".//ns:hs_Question", namespaces=self.ns_map)
for question in questions:
self.parse_para_with_member(question, member if want_member else None)
@@ -1541,47 +1554,45 @@ def parse_tabledby(self, tabledby):
tabledby,
None,
strip_member=False,
- css_class='italic',
- pwmotiontext='unrecognized'
+ css_class="italic",
+ pwmotiontext="unrecognized",
)
def parse_amendment(self, amendment):
self.parse_para_with_member(
- amendment,
- None,
- css_class='italic',
- pwmotiontext='unrecognized'
+ amendment, None, css_class="italic", pwmotiontext="unrecognized"
)
def parse_clause_heading(self, heading):
- tag = etree.Element('p')
+ tag = etree.Element("p")
text = self.get_single_line_text_from_element(heading)
- i = etree.Element('i')
+ i = etree.Element("i")
i.text = text
- b = etree.Element('b')
+ b = etree.Element("b")
b.append(i)
- tag.set('pid', self.get_pid())
+ tag.set("pid", self.get_pid())
tag.append(b)
if self.current_speech is None:
- self.new_speech(None, heading.get('url'))
+ self.new_speech(None, heading.get("url"))
self.current_speech.append(tag)
def parse_division(self, division):
- ayes_count = \
- division.xpath('.//ns:ContentsNumber/text()', namespaces=self.ns_map)
- noes_count = \
- division.xpath('.//ns:NotContentsNumber/text()', namespaces=self.ns_map)
+ ayes_count = division.xpath(
+ ".//ns:ContentsNumber/text()", namespaces=self.ns_map
+ )
+ noes_count = division.xpath(
+ ".//ns:NotContentsNumber/text()", namespaces=self.ns_map
+ )
- ayes_count_text = ''.join(ayes_count)
- noes_count_text = ''.join(noes_count)
+ ayes_count_text = "".join(ayes_count)
+ noes_count_text = "".join(noes_count)
# output a summary of the division results
- div_summary = \
- "Ayes {0}, Noes {1}.".format(ayes_count_text, noes_count_text)
- div_summary_tag = etree.Element('p')
- div_summary_tag.set('pid', self.get_pid())
- div_summary_tag.set('pwmotiontext', 'yes')
+ div_summary = "Ayes {0}, Noes {1}.".format(ayes_count_text, noes_count_text)
+ div_summary_tag = etree.Element("p")
+ div_summary_tag.set("pid", self.get_pid())
+ div_summary_tag.set("pwmotiontext", "yes")
div_summary_tag.text = div_summary
self.current_speech.append(div_summary_tag)
@@ -1590,50 +1601,49 @@ def parse_division(self, division):
tag = self.get_division_tag(division, ayes_count_text, noes_count_text)
ayes = division.xpath(
- './/ns:NamesContents//ns:hs_DivListNames', namespaces=self.ns_map
+ ".//ns:NamesContents//ns:hs_DivListNames", namespaces=self.ns_map
)
noes = division.xpath(
- './/ns:NamesNotContents//ns:hs_DivListNames', namespaces=self.ns_map
+ ".//ns:NamesNotContents//ns:hs_DivListNames", namespaces=self.ns_map
)
- aye_list = etree.Element('lordlist')
- aye_list.set('vote', 'content')
- aye_list = self.parse_votelist(ayes, 'content', aye_list)
+ aye_list = etree.Element("lordlist")
+ aye_list.set("vote", "content")
+ aye_list = self.parse_votelist(ayes, "content", aye_list)
tag.append(aye_list)
- no_list = etree.Element('lordlist')
- no_list.set('vote', 'not-content')
- no_list = self.parse_votelist(noes, 'not-content', no_list)
+ no_list = etree.Element("lordlist")
+ no_list.set("vote", "not-content")
+ no_list = self.parse_votelist(noes, "not-content", no_list)
tag.append(no_list)
self.root.append(tag)
- paras = division.xpath('./ns:hs_Procedure', namespaces=self.ns_map)
+ paras = division.xpath("./ns:hs_Procedure", namespaces=self.ns_map)
for para in paras:
- text = ''.join(para.xpath('.//text()'))
- if re.search(r'Contents', text) or \
- re.search(r'Division\s*on', text):
+ text = "".join(para.xpath(".//text()"))
+ if re.search(r"Contents", text) or re.search(r"Division\s*on", text):
continue
self.parse_para(para)
def parse_votelist(self, votes, direction, vote_list):
for vote in votes:
- tag = etree.Element('lord')
+ tag = etree.Element("lord")
member_name = self.get_single_line_text_from_element(vote)
is_teller = False
- if re.match('.*\[Teller\].*', member_name):
- member_name = re.sub('\[Teller\]', '', member_name)
+ if re.match(".*\[Teller\].*", member_name):
+ member_name = re.sub("\[Teller\]", "", member_name)
member_name = member_name.strip()
is_teller = True
# convert smart quote to apostrophe
- member_name = re.sub('\u2019', "'", member_name)
+ member_name = re.sub("\u2019", "'", member_name)
- member = self.resolver.MatchRevName(member_name, self.date, vote.get('url'))
- tag.set('person_id', member)
- tag.set('vote', direction)
+ member = self.resolver.MatchRevName(member_name, self.date, vote.get("url"))
+ tag.set("person_id", member)
+ tag.set("vote", direction)
if is_teller:
- tag.set('teller', 'yes')
+ tag.set("teller", "yes")
tag.text = self.resolver.name_on_date(member, self.date)
vote_list.append(tag)
@@ -1642,25 +1652,25 @@ def parse_votelist(self, votes, direction, vote_list):
def handle_tag(self, tag_name, tag):
handled = True
- if tag_name == 'hs_time':
+ if tag_name == "hs_time":
self.parse_time(tag)
- elif tag_name == 'hs_quotefo':
+ elif tag_name == "hs_quotefo":
self.parse_quote(tag)
- elif tag_name == 'NewDebate':
+ elif tag_name == "NewDebate":
self.parse_newdebate(tag)
- elif tag_name == 'hs_Procedure':
+ elif tag_name == "hs_Procedure":
self.parse_procedure(tag)
- elif tag_name == 'hs_prayers':
+ elif tag_name == "hs_prayers":
return True
- elif tag_name == 'hs_AmendmentHeading':
+ elif tag_name == "hs_AmendmentHeading":
self.parse_amendment_heading(tag)
- elif tag_name == 'hs_TabledBy':
+ elif tag_name == "hs_TabledBy":
self.parse_tabledby(tag)
- elif tag_name == 'Amendment':
+ elif tag_name == "Amendment":
self.parse_amendment(tag)
- elif tag_name == 'hs_ClauseHeading':
+ elif tag_name == "hs_ClauseHeading":
self.parse_clause_heading(tag)
- elif tag_name == 'Division':
+ elif tag_name == "Division":
self.parse_division(tag)
else:
handled = super(LordsParseDayXML, self).handle_tag(tag_name, tag)
@@ -1669,25 +1679,20 @@ def handle_tag(self, tag_name, tag):
class ParseDay(object):
- valid_types = [
- 'debate',
- 'westminhall',
- 'lords',
- 'standing'
- ]
+ valid_types = ["debate", "westminhall", "lords", "standing"]
output_dirs = {
- 'debate': 'debates',
- 'westminhall': 'westminhall',
- 'lords': 'lordspages',
- 'standing': 'standing'
+ "debate": "debates",
+ "westminhall": "westminhall",
+ "lords": "lordspages",
+ "standing": "standing",
}
output_files = {
- 'debate': 'debates',
- 'westminhall': 'westminster',
- 'lords': 'daylord',
- 'standing': 'standing'
+ "debate": "debates",
+ "westminhall": "westminster",
+ "lords": "daylord",
+ "standing": "standing",
}
parser = None
@@ -1698,14 +1703,15 @@ def reset(self):
self.parser = None
def get_output_pbc_filename(self, date, xml_file):
- shortnamemap = { }
+ shortnamemap = {}
pwstandingpages = os.path.join(pwxmldirs, "standing")
for f in os.listdir(pwstandingpages):
m = re.match("(standing.*?)([a-z]*)\.xml$", f)
if m:
shortnamemap.setdefault(m.group(1), []).append(
- (miscfuncs.AlphaStringToOrder(m.group(2)), m.group(2), f))
- elif f.endswith('~') or f == 'changedates.txt':
+ (miscfuncs.AlphaStringToOrder(m.group(2)), m.group(2), f)
+ )
+ elif f.endswith("~") or f == "changedates.txt":
pass
elif os.path.isfile(os.path.join(pwstandingpages, f)):
print("not recognized file:", f, " in ", pwstandingpages)
@@ -1719,7 +1725,7 @@ def get_output_pbc_filename(self, date, xml_file):
dgflatestalpha = ldgf[1]
dgflatest = os.path.join(pwstandingpages, ldgf[2])
self.rev = miscfuncs.NextAlphaString(dgflatestalpha)
- dgfnext = os.path.join(pwstandingpages, '%s%s.xml' % (sitting_id, self.rev))
+ dgfnext = os.path.join(pwstandingpages, "%s%s.xml" % (sitting_id, self.rev))
assert not dgflatest or os.path.isfile(dgflatest)
assert not os.path.isfile(dgfnext), dgfnext
@@ -1730,19 +1736,18 @@ def get_output_filename(self, date, debate_type):
self.output_dirs.get(debate_type),
self.output_files.get(debate_type),
pwxmldirs,
- 'xml'
+ "xml",
)
- latestFilePath, latestFileStem, nextFilePath, nextFileStem = \
- GetFileDayVersions(
- date,
- daymap,
- scrapedDataOutputPath,
- self.output_files.get(debate_type),
- 'xml'
- )
+ latestFilePath, latestFileStem, nextFilePath, nextFileStem = GetFileDayVersions(
+ date,
+ daymap,
+ scrapedDataOutputPath,
+ self.output_files.get(debate_type),
+ "xml",
+ )
- version_match = re.match('\d+-\d+-\d+([a-z])', nextFileStem)
+ version_match = re.match("\d+-\d+-\d+([a-z])", nextFileStem)
self.rev = version_match.groups(1)[0]
return latestFilePath, nextFilePath
@@ -1751,6 +1756,7 @@ def get_output_filename(self, date, debate_type):
This fakes exactly enough of the old flatb structure from the filter
version of the code for use in the diff/redirect creation code.
"""
+
def gen_flatb(self, chks):
flatb = []
for chk in chks:
@@ -1759,53 +1765,57 @@ def gen_flatb(self, chks):
if gidmatch:
gid = gidmatch.group(1)
# http://stackoverflow.com/questions/652276/is-it-possible-to-create-anonymous-objects-in-python#652417
- entry = type('', (object,), {"GID": gid})()
+ entry = type("", (object,), {"GID": gid})()
flatb.append(entry)
return flatb
def normalise_gids(self, string):
- string = re.sub('(publicwhip\/[a-z]*\/\d{4}-\d{2}-\d{2})[a-z]', r'\1', string)
- string = re.sub('(publicwhip\/standing\/.*?\d{4}-\d{2}-\d{2})[a-z]', r'\1', string)
- string = re.sub('(pid=")[a-z]([\d.\/]*")', r'\1\2', string)
+ string = re.sub("(publicwhip\/[a-z]*\/\d{4}-\d{2}-\d{2})[a-z]", r"\1", string)
+ string = re.sub(
+ "(publicwhip\/standing\/.*?\d{4}-\d{2}-\d{2})[a-z]", r"\1", string
+ )
+ string = re.sub('(pid=")[a-z]([\d.\/]*")', r"\1\2", string)
return string
def compare_xml_files(self, prevfile, nextfile):
- hprevfile = io.open(prevfile, encoding='utf-8')
+ hprevfile = io.open(prevfile, encoding="utf-8")
dprevfile = hprevfile.readlines()
hprevfile.close()
- hnextfile = io.open(nextfile, encoding='utf-8')
+ hnextfile = io.open(nextfile, encoding="utf-8")
dnextfile = hnextfile.readlines()
hnextfile.close()
if len(dprevfile) == len(dnextfile):
- sprevfile = self.normalise_gids(''.join(dprevfile[1:]))
- snextfile = self.normalise_gids(''.join(dnextfile[1:]))
+ sprevfile = self.normalise_gids("".join(dprevfile[1:]))
+ snextfile = self.normalise_gids("".join(dnextfile[1:]))
if sprevfile == snextfile:
return "SAME"
if len(dprevfile) < len(dnextfile):
- sprevfile = self.normalise_gids(''.join(dprevfile[1:]))
- snextfile = self.normalise_gids(''.join(dnextfile[1:len(dprevfile)]))
+ sprevfile = self.normalise_gids("".join(dprevfile[1:]))
+ snextfile = self.normalise_gids("".join(dnextfile[1 : len(dprevfile)]))
if sprevfile == snextfile:
return "EXTENSION"
return "DIFFERENT"
def remove_para_newlines(self, string):
return re.sub(
-            '(?s)(<p[^>]*>)(.*?)(<\/p>)',
- lambda m: (''.join((m.group(1), re.sub('\n', ' ', m.group(2)), m.group(3)))),
- string
+            "(?s)(<p[^>]*>)(.*?)(<\/p>)",
+ lambda m: (
+ "".join((m.group(1), re.sub("\n", " ", m.group(2)), m.group(3)))
+ ),
+ string,
)
def rewrite_previous_version(self, newfile):
# open the old and new XML files
- xin = io.open(self.prev_file, encoding='utf-8')
+ xin = io.open(self.prev_file, encoding="utf-8")
xprevs = xin.read()
xin.close()
- xin = io.open(newfile, encoding='utf-8')
+ xin = io.open(newfile, encoding="utf-8")
xcur = xin.read()
xin.close()
@@ -1813,11 +1823,11 @@ def rewrite_previous_version(self, newfile):
xcur = self.remove_para_newlines(xcur)
# pull out the scrape versions and the XML as a string
-        mpw = re.search('<publicwhip([^>]*)>([\s\S]*?)</publicwhip>', xprevs)
-        mpc = re.search('<publicwhip([^>]*)>([\s\S]*?)</publicwhip>', xcur)
+        mpw = re.search("<publicwhip([^>]*)>([\s\S]*?)</publicwhip>", xprevs)
+        mpc = re.search("<publicwhip([^>]*)>([\s\S]*?)</publicwhip>", xcur)
if mpc is None or mpw is None:
- sys.stderr.write('Failed to do diff for {0}\n'.format(self.prev_file))
+ sys.stderr.write("Failed to do diff for {0}\n".format(self.prev_file))
return
# take the XML string and turn it into the data structures used
@@ -1825,11 +1835,15 @@ def rewrite_previous_version(self, newfile):
essflatbindx, essflatblist, oldchks = PrepareXMLForDiff(mpc.group(2))
essxindx, essxlist, chks = PrepareXMLForDiff(mpw.group(2))
flatb = self.gen_flatb(oldchks)
- xprevcompress = DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb)
+ xprevcompress = DoFactorDiff(
+ essflatbindx, essflatblist, essxindx, essxlist, chks, flatb
+ )
# spit out the rewritten previous version with redirects
- tempfilenameoldxml = tempfile.mktemp(".xml", "pw-filtertempold-", miscfuncs.tmppath)
- foout = io.open(tempfilenameoldxml, mode="w", encoding='utf-8')
+ tempfilenameoldxml = tempfile.mktemp(
+ ".xml", "pw-filtertempold-", miscfuncs.tmppath
+ )
+ foout = io.open(tempfilenameoldxml, mode="w", encoding="utf-8")
if self.parser.is_pre_new_parser:
WriteXMLHeader(foout)
foout.write('\n' % self.prev_file)
@@ -1846,17 +1860,17 @@ def output(self, stream):
def handle_file(self, filename, debate_type, verbose):
if debate_type not in self.valid_types:
- sys.stderr.write('{0} not a valid type'.format(debate_type))
+ sys.stderr.write("{0} not a valid type".format(debate_type))
sys.exit()
- xml_file = io.open(filename, encoding='utf-8')
+ xml_file = io.open(filename, encoding="utf-8")
self.set_parser_for_type(debate_type)
self.parser.verbose = verbose
date = self.parser.get_date(xml_file)
if date is False:
- return 'not-present'
+ return "not-present"
- if debate_type == 'standing':
+ if debate_type == "standing":
prev_file, output_file = self.get_output_pbc_filename(date, xml_file)
else:
prev_file, output_file = self.get_output_filename(date, debate_type)
@@ -1870,13 +1884,13 @@ def handle_file(self, filename, debate_type, verbose):
parse_ok = self.parse_day(xml_file, debate_type)
if parse_ok:
- out = io.open(tempfilename, mode='w', encoding='utf-8')
+ out = io.open(tempfilename, mode="w", encoding="utf-8")
self.output(out)
out.close()
else:
- sys.stderr.write('Failed to parse {0}\n'.format(filename))
+ sys.stderr.write("Failed to parse {0}\n".format(filename))
os.remove(tempfilename)
- return 'failed'
+ return "failed"
        # FIXME: should be using more temp files here
# if we have a previous version check if it's different from
@@ -1884,32 +1898,32 @@ def handle_file(self, filename, debate_type, verbose):
if self.prev_file is not None:
diffs = self.compare_xml_files(self.prev_file, tempfilename)
# if they are the same then delete the old one
- if diffs == 'SAME':
+ if diffs == "SAME":
os.remove(tempfilename)
- return 'same'
+ return "same"
# otherwise do the diff and redirect dance
else:
self.rewrite_previous_version(tempfilename)
- return 'change'
+ return "change"
else:
os.rename(tempfilename, self.output_file)
- return 'new'
+ return "new"
def set_parser_for_type(self, debate_type):
if self.parser is not None:
return
parser_types = {
- 'lords': LordsParseDayXML,
- 'standing': PBCParseDayXML,
+ "lords": LordsParseDayXML,
+ "standing": PBCParseDayXML,
}
self.parser = parser_types.get(debate_type, CommonsParseDayXML)()
self.parser.debate_type = debate_type
def parse_day(self, text, debate_type):
self.set_parser_for_type(debate_type)
- if debate_type == 'standing':
- if not hasattr(self.parser, 'sitting_id'):
+ if debate_type == "standing":
+ if not hasattr(self.parser, "sitting_id"):
self.parser.get_sitting(text)
parse_ok = self.parser.parse_day(text)
if parse_ok:
@@ -1917,8 +1931,9 @@ def parse_day(self, text, debate_type):
return False
-if __name__ == '__main__':
- xml_file = codecs.open(sys.argv[1], encoding='utf-8')
+
+if __name__ == "__main__":
+ xml_file = codecs.open(sys.argv[1], encoding="utf-8")
house = sys.argv[2]
parse = ParseDay()
parse_ok = parse.parse_day(xml_file, house)
diff --git a/pyscraper/ni/parse.py b/pyscraper/ni/parse.py
index 617ca7e7..84420ce2 100755
--- a/pyscraper/ni/parse.py
+++ b/pyscraper/ni/parse.py
@@ -1,15 +1,17 @@
#! /usr/bin/env python3
-import re
import json
import os
+import re
import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
-from ni.resolvenames import memberList
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from contextexception import ContextException
-parldata = '../../../parldata/'
+from ni.resolvenames import memberList
+
+parldata = "../../../parldata/"
+
class ParseDayParserBase(object):
def __init__(self, fp, date, **kwargs):
@@ -19,21 +21,22 @@ def __init__(self, fp, date, **kwargs):
self.idB = 0
def id(self):
- return '%s.%s.%s' % (self.date, self.idA, self.idB)
+ return "%s.%s.%s" % (self.date, self.idA, self.idB)
def time_period(self, ptext, optional=False):
- match = re.search('(\d\d?)(?:[.:]\s*(\d\d?))? ?(am|pm|noon|midnight)', ptext)
+ match = re.search("(\d\d?)(?:[.:]\s*(\d\d?))? ?(am|pm|noon|midnight)", ptext)
if not match:
if not optional:
- raise ContextException('Time not found in TimePeriod %s' % p)
+ raise ContextException("Time not found in TimePeriod %s" % ptext)
return None
hour = int(match.group(1))
- if hour<12 and match.group(3) == 'pm':
+ if hour < 12 and match.group(3) == "pm":
hour += 12
- if hour==12 and match.group(3) in ('midnight', 'am'):
+ if hour == 12 and match.group(3) in ("midnight", "am"):
hour = 0
- minutes = match.group(2) or '00'
- if len(minutes) == 1: minutes = '0' + minutes
+ minutes = match.group(2) or "00"
+ if len(minutes) == 1:
+ minutes = "0" + minutes
timestamp = "%s:%s" % (hour, minutes)
return timestamp
@@ -41,106 +44,142 @@ def time_period(self, ptext, optional=False):
class ParseDayJSON(ParseDayParserBase):
def display_speech(self):
if self.heading:
- timestamp = self.heading['ts']
+ timestamp = self.heading["ts"]
if timestamp:
timestamp = ' time="%s"' % timestamp
- typ = self.heading['type']
- text = self.heading['text']
- if typ == 'major':
+ typ = self.heading["type"]
+ text = self.heading["text"]
+ if typ == "major":
self.idA += 1
self.idB = 0
else:
self.idB += 1
-            self.out.write('<%s-heading id="uk.org.publicwhip/ni/%s"%s>%s</%s-heading>\n' % (typ, self.id(), timestamp, text, typ))
+ self.out.write(
+                '<%s-heading id="uk.org.publicwhip/ni/%s"%s>%s</%s-heading>\n'
+ % (typ, self.id(), timestamp, text, typ)
+ )
self.heading = {}
if self.text:
- if 'id' in self.speaker:
- speaker_str = self.speaker['id']
- elif 'name' in self.speaker:
- speaker_str = 'person_id="unknown" speakername="%s"' % self.speaker['name']
+ if "id" in self.speaker:
+ speaker_str = self.speaker["id"]
+ elif "name" in self.speaker:
+ speaker_str = (
+ 'person_id="unknown" speakername="%s"' % self.speaker["name"]
+ )
else:
speaker_str = 'nospeaker="true"'
- timestamp = self.speaker.get('ts', '')
+ timestamp = self.speaker.get("ts", "")
if timestamp:
timestamp = ' time="%s"' % timestamp
self.idB += 1
-            self.out.write('<speech id="uk.org.publicwhip/ni/%s" %s%s>\n%s</speech>\n' % (self.id(), speaker_str, timestamp, self.text))
- self.text = ''
+ self.out.write(
+                '<speech id="uk.org.publicwhip/ni/%s" %s%s>\n%s</speech>\n'
+ % (self.id(), speaker_str, timestamp, self.text)
+ )
+ self.text = ""
def parse_day(self, input):
self.heading = {}
self.pre_heading = {}
self.speaker = {}
- self.text = ''
- timestamp = ''
+ self.text = ""
+ timestamp = ""
j = json.loads(input)
- if 'AllHansardComponentsList' in j:
- j = j['AllHansardComponentsList']['HansardComponent']
+ if "AllHansardComponentsList" in j:
+ j = j["AllHansardComponentsList"]["HansardComponent"]
for line in j:
-            text = (line['ComponentText'] or '').replace('&', '&amp;')
+            text = (line["ComponentText"] or "").replace("&", "&amp;")
if not text:
print("WARNING: Empty line: %s" % line)
- elif line['ComponentType'] == 'Document Title':
- assert re.match('(Plenary|PLE), %s/%s/%s$(?i)' % (self.date[8:10], self.date[5:7], self.date[0:4]), text), text
- elif line['ComponentType'] == 'Time':
+ elif line["ComponentType"] == "Document Title":
+ assert re.match(
+ "(Plenary|PLE), %s/%s/%s$(?i)"
+ % (self.date[8:10], self.date[5:7], self.date[0:4]),
+ text,
+ ), text
+ elif line["ComponentType"] == "Time":
timestamp = self.time_period(text)
- elif line['ComponentType'] == 'Header':
- if line['ComponentHeaderId'] in (0, 1, '0', '1'):
- typ = 'major'
- elif line['ComponentHeaderId'] in (2, '2'):
- typ = 'minor'
+ elif line["ComponentType"] == "Header":
+ if line["ComponentHeaderId"] in (0, 1, "0", "1"):
+ typ = "major"
+ elif line["ComponentHeaderId"] in (2, "2"):
+ typ = "minor"
else:
- raise Exception("Unknown ComponentHeaderId %s" % line['ComponentHeaderId'])
- if self.heading and self.heading['type'] == typ:
- self.pre_heading = {'level': line['ComponentHeaderId'], 'text': self.heading['text']}
- self.heading['text'] += ' — %s' % text
+ raise Exception(
+ "Unknown ComponentHeaderId %s" % line["ComponentHeaderId"]
+ )
+ if self.heading and self.heading["type"] == typ:
+ self.pre_heading = {
+ "level": line["ComponentHeaderId"],
+ "text": self.heading["text"],
+ }
+ self.heading["text"] += " — %s" % text
else:
self.display_speech()
- self.speaker = {'ts': timestamp}
- if self.pre_heading and self.pre_heading['level'] == line['ComponentHeaderId']:
- text = '%s — %s' % (self.pre_heading['text'], text)
- elif self.pre_heading and self.pre_heading['level'] > line['ComponentHeaderId']:
+ self.speaker = {"ts": timestamp}
+ if (
+ self.pre_heading
+ and self.pre_heading["level"] == line["ComponentHeaderId"]
+ ):
+ text = "%s — %s" % (self.pre_heading["text"], text)
+ elif (
+ self.pre_heading
+ and self.pre_heading["level"] > line["ComponentHeaderId"]
+ ):
self.pre_heading = {}
- self.heading = {'text': text, 'ts': timestamp, 'type': typ}
- elif re.match('Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$', line['ComponentType']):
+ self.heading = {"text": text, "ts": timestamp, "type": typ}
+ elif re.match(
+ "Speaker \((MlaName|DeputyChairAndName|ChairAndName|DeputySpeaker|PrincipalDeputySpeaker|MinisterAndName|ActingSpeaker|TemporarySpeaker|Speaker)\)$",
+ line["ComponentType"],
+ ):
# RelatedItemId here is the NI speaker ID. We could use that!
# But for now, carry on going by name as all that code exists.
self.display_speech()
- speaker = text.replace(':', '')
+ speaker = text.replace(":", "")
id, stri = memberList.match(speaker, self.date)
- self.speaker = {'id': stri, 'ts': timestamp}
- elif line['ComponentType'] == 'Speaker (Special)' or line['ComponentType'] == 'Speaker (GuestSpeaker)':
+ self.speaker = {"id": stri, "ts": timestamp}
+ elif (
+ line["ComponentType"] == "Speaker (Special)"
+ or line["ComponentType"] == "Speaker (GuestSpeaker)"
+ ):
self.display_speech()
- speaker = text.replace(':', '')
- self.speaker = {'name': speaker, 'ts': timestamp}
- elif line['ComponentType'] == 'Question':
+ speaker = text.replace(":", "")
+ self.speaker = {"name": speaker, "ts": timestamp}
+ elif line["ComponentType"] == "Question":
self.display_speech()
- m = re.match('(T?[0-9]+\. )?(.*?) asked', text)
+ m = re.match("(T?[0-9]+\. )?(.*?) asked", text)
id, stri = memberList.match(m.group(2), self.date)
- self.speaker = {'id': stri, 'ts': timestamp}
+ self.speaker = {"id": stri, "ts": timestamp}
self.text += "%s \n" % text
- elif line['ComponentType'] == 'Quote':
+ elif line["ComponentType"] == "Quote":
                self.text += '<p class="indent">%s</p>\n' % text
- elif line['ComponentType'] in ('Plenary Item Text', 'Procedure Line'):
- match = re.match('The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)', text)
+ elif line["ComponentType"] in ("Plenary Item Text", "Procedure Line"):
+ match = re.match(
+ "The Assembly met at ((\d\d?):(\d\d?) (am|pm)|12 noon)", text
+ )
if match:
timestamp = self.time_period(text)
- self.speaker['ts'] = timestamp
+ self.speaker["ts"] = timestamp
                self.text += '<p class="italic">%s</p>\n' % text
- elif line['ComponentType'] == 'Bill Text':
- self.text += text.replace('', ' ') # Already is HTML
- elif line['ComponentType'] in ('Division', 'Spoken Text'):
- text = re.sub('\s* \s* \s*(?i)', ' \n', text)
- text = re.sub('WIDTH=50%', 'WIDTH="50%"', text)
-                self.text += '<p>%s</p>\n' % text
+ elif line["ComponentType"] == "Bill Text":
+ self.text += text.replace(
+ "", ' '
+ ) # Already is HTML
+ elif line["ComponentType"] in ("Division", "Spoken Text"):
+ text = re.sub("\s* \s* \s*(?i)", " \n", text)
+ text = re.sub("WIDTH=50%", 'WIDTH="50%"', text)
+                self.text += "<p>%s</p>\n" % text
else:
- raise ContextException("Uncaught Component Type! %s" % line['ComponentType'])
+ raise ContextException(
+ "Uncaught Component Type! %s" % line["ComponentType"]
+ )
self.display_speech()
+
class ParseDay(object):
def parse_day(self, out, text, date):
out.write('\n')
- out.write('''
+ out.write("""
@@ -191,16 +230,16 @@ def parse_day(self, out, text, date):
]>
-''')
- if date > '2014-11-01':
+""")
+ if date > "2014-11-01":
parser = ParseDayJSON(out, date)
else:
sys.exit("Parsing <=2014-11-01 HTML is no longer supported")
parser.parse_day(text)
-        out.write('</publicwhip>\n')
+        out.write("</publicwhip>\n")
-if __name__ == '__main__':
+if __name__ == "__main__":
fp = sys.stdout
text = open(sys.argv[1]).read()
date = os.path.basename(sys.argv[1])[2:12]
diff --git a/pyscraper/ni/resolvenames.py b/pyscraper/ni/resolvenames.py
index 9980f553..ac7a7ecd 100755
--- a/pyscraper/ni/resolvenames.py
+++ b/pyscraper/ni/resolvenames.py
@@ -1,49 +1,60 @@
-import re
import datetime
-from contextexception import ContextException
+import re
from base_resolver import ResolverBase
+from contextexception import ContextException
+
class MemberList(ResolverBase):
deputy_speaker = None
- import_organization_id = 'northern-ireland-assembly'
+ import_organization_id = "northern-ireland-assembly"
def reloadJSON(self):
super(MemberList, self).reloadJSON()
self.members = {
- "uk.org.publicwhip/member/454" : { 'given_name':'Paul', 'family_name':'Murphy', 'title':'', 'party':'Labour' },
- "uk.org.publicwhip/member/384" : { 'given_name':'John', 'family_name':'McFall', 'title':'', 'party':'Labour' },
- } # ID --> MLAs
-
- self.debatedate=None
- self.debatenamehistory=[] # recent speakers in debate
- self.debateofficehistory={} # recent offices ("The Deputy Prime Minister")
-
- self.retitles = re.compile('^(?:Rev |Dr |Mr |Mrs |Ms |Miss |Sir |Lord )+')
- self.rehonorifics = re.compile('(?: OBE| CBE| MP)+$')
+ "uk.org.publicwhip/member/454": {
+ "given_name": "Paul",
+ "family_name": "Murphy",
+ "title": "",
+ "party": "Labour",
+ },
+ "uk.org.publicwhip/member/384": {
+ "given_name": "John",
+ "family_name": "McFall",
+ "title": "",
+ "party": "Labour",
+ },
+ } # ID --> MLAs
+
+ self.debatedate = None
+ self.debatenamehistory = [] # recent speakers in debate
+ self.debateofficehistory = {} # recent offices ("The Deputy Prime Minister")
+
+ self.retitles = re.compile("^(?:Rev |Dr |Mr |Mrs |Ms |Miss |Sir |Lord )+")
+ self.rehonorifics = re.compile("(?: OBE| CBE| MP)+$")
self.import_constituencies()
self.import_people_json()
def list(self, date=None, fro=None, to=None):
- if date == 'now':
+ if date == "now":
date = datetime.date.today().isoformat()
if date:
fro = to = date
if not fro:
- fro = '1000-01-01'
+ fro = "1000-01-01"
if not to:
- to = '9999-12-31'
+ to = "9999-12-31"
ids = []
for m in self.members.values():
- if 'start_date' in m and to >= m["start_date"] and fro <= m["end_date"]:
+ if "start_date" in m and to >= m["start_date"] and fro <= m["end_date"]:
ids.append(self.membertoperson(m["id"]))
return ids
# useful to have this function out there
def striptitles(self, text):
-        text = text.replace("&rsquo;", "'").replace('\u2019', "'")
+        text = text.replace("&rsquo;", "'").replace("\u2019", "'")
        text = text.replace("&nbsp;", " ")
(text, titletotal) = self.retitles.subn("", text)
text = self.rehonorifics.sub("", text)
@@ -52,13 +63,16 @@ def striptitles(self, text):
# date can be none, will give more matches
def fullnametoids(self, tinput, date):
# Special case gender uniques
- if tinput == 'Mrs Bell': tinput = 'Mrs E Bell'
+ if tinput == "Mrs Bell":
+ tinput = "Mrs E Bell"
text, titletotal = self.striptitles(tinput)
# Special case for non-MLAs
- if text == 'P Murphy': return ["uk.org.publicwhip/member/454"]
- if text == 'McFall': return ["uk.org.publicwhip/member/384"]
+ if text == "P Murphy":
+ return ["uk.org.publicwhip/member/454"]
+ if text == "McFall":
+ return ["uk.org.publicwhip/member/384"]
# Find unique identifier for member
ids = set()
@@ -70,20 +84,30 @@ def fullnametoids(self, tinput, date):
# If a speaker, then match against the special speaker parties
if text == "Speaker" or text == "The Speaker":
matches.extend(self.parties.get("Speaker", []))
- if not matches and text in ('Deputy Speaker', 'Madam Deputy Speaker', 'The Deputy Speaker', 'The Principal Deputy Speaker', 'Madam Principal Deputy Speaker'):
+ if not matches and text in (
+ "Deputy Speaker",
+ "Madam Deputy Speaker",
+ "The Deputy Speaker",
+ "The Principal Deputy Speaker",
+ "Madam Principal Deputy Speaker",
+ ):
if not self.deputy_speaker:
- raise ContextException('Deputy speaker speaking, but do not know who it is')
+ raise ContextException(
+ "Deputy speaker speaking, but do not know who it is"
+ )
return self.fullnametoids(self.deputy_speaker, date)
if matches:
for m in matches:
- if (date == None) or (date >= m["start_date"] and date <= m["end_date"]):
+ if (date == None) or (
+ date >= m["start_date"] and date <= m["end_date"]
+ ):
ids.add(m["id"])
return ids
def setDeputy(self, deputy):
- if deputy == 'Mr Wilson':
- deputy = 'Mr J Wilson'
+ if deputy == "Mr Wilson":
+ deputy = "Mr J Wilson"
self.deputy_speaker = deputy
def match_person(self, input, date=None):
@@ -92,7 +116,9 @@ def match_person(self, input, date=None):
if len(ids) == 0:
raise ContextException("No match %s" % input)
if len(ids) > 1:
- raise ContextException("Multiple matches %s, possibles are %s" % (input, ids))
+ raise ContextException(
+ "Multiple matches %s, possibles are %s" % (input, ids)
+ )
id = ids.pop()
return id
@@ -101,10 +127,10 @@ def match(self, input, date):
if self.debatedate != date:
self.debatedate = date
self.cleardebatehistory()
- speakeroffice = ''
+ speakeroffice = ""
office = None
- input = re.sub(' \(Designate\)', '', input)
- match = re.match('(.*) \((.*?)\)\s*$', input)
+ input = re.sub(" \(Designate\)", "", input)
+ match = re.match("(.*) \((.*?)\)\s*$", input)
if match:
office = match.group(1)
speakeroffice = ' speakeroffice="%s"' % office
@@ -123,42 +149,54 @@ def match(self, input, date):
self.debateofficehistory.setdefault(office, set()).update(ids)
if len(ids) == 0:
- if not re.search('Some Members|A Member|Several Members|Members', input):
+ if not re.search("Some Members|A Member|Several Members|Members", input):
# import pdb;pdb.set_trace()
raise ContextException("No matches %s" % (input))
- return None, 'person_id="unknown" error="No match" speakername="%s"' % (input)
- if len(ids) > 1 and 'uk.org.publicwhip/member/90355' in ids:
+ return None, 'person_id="unknown" error="No match" speakername="%s"' % (
+ input
+ )
+ if len(ids) > 1 and "uk.org.publicwhip/member/90355" in ids:
# Special case for 8th May, when Mr Hay becomes Speaker
- if input == 'Mr Hay':
- ids.remove('uk.org.publicwhip/member/90355')
- elif input == 'Mr Speaker':
- ids.remove('uk.org.publicwhip/member/90287')
+ if input == "Mr Hay":
+ ids.remove("uk.org.publicwhip/member/90355")
+ elif input == "Mr Speaker":
+ ids.remove("uk.org.publicwhip/member/90287")
else:
- raise ContextException('Problem with Mr Hay!')
- elif len(ids) > 1 and 'uk.org.publicwhip/member/90449' in ids:
+ raise ContextException("Problem with Mr Hay!")
+ elif len(ids) > 1 and "uk.org.publicwhip/member/90449" in ids:
# Special case for 2015-01-12, when Mr McLaughlin becomes Speaker
- if input == 'Mr Mitchel McLaughlin':
- ids.remove('uk.org.publicwhip/member/90497')
- elif input == 'Mr Principal Deputy Speaker':
- ids.remove('uk.org.publicwhip/member/90497')
- elif input == 'Mr Speaker':
- ids.remove('uk.org.publicwhip/member/90449')
+ if input == "Mr Mitchel McLaughlin":
+ ids.remove("uk.org.publicwhip/member/90497")
+ elif input == "Mr Principal Deputy Speaker":
+ ids.remove("uk.org.publicwhip/member/90497")
+ elif input == "Mr Speaker":
+ ids.remove("uk.org.publicwhip/member/90449")
else:
raise ContextException('Problem with Mr McLaughlin! Got "%s"' % input)
elif len(ids) > 1:
names = ""
for id in ids:
name = self.name_on_date(self.membertoperson(id), date)
- names += '%s %s (%s) ' % (id, name, self.members[id]["constituency"])
- raise ContextException("Multiple matches %s, possibles are %s" % (input, names))
- return None, 'person_id="unknown" error="Matched multiple times" speakername="%s"' % (input)
+ names += "%s %s (%s) " % (id, name, self.members[id]["constituency"])
+ raise ContextException(
+ "Multiple matches %s, possibles are %s" % (input, names)
+ )
+ return (
+ None,
+ 'person_id="unknown" error="Matched multiple times" speakername="%s"'
+ % (input),
+ )
for id in ids:
pass
person_id = self.membertoperson(id)
remadename = self.name_on_date(person_id, date)
if self.members[id]["party"] == "Speaker" and re.search("Speaker", input):
remadename = input
- return person_id, 'person_id="%s" speakername="%s"%s' % (person_id, remadename, speakeroffice)
+ return person_id, 'person_id="%s" speakername="%s"%s' % (
+ person_id,
+ remadename,
+ speakeroffice,
+ )
def cleardebatehistory(self):
self.debatenamehistory = []
@@ -167,4 +205,5 @@ def cleardebatehistory(self):
def getmember(self, memberid):
return self.members[memberid]
+
memberList = MemberList()
diff --git a/pyscraper/ni/scrape.py b/pyscraper/ni/scrape.py
index 9f2db033..de5951a5 100755
--- a/pyscraper/ni/scrape.py
+++ b/pyscraper/ni/scrape.py
@@ -3,52 +3,57 @@
# XXX Pagination has been introduced for the 1998-2003 pages, so any
# rescraping of those will break with this current code.
+import datetime
import json
-import urllib.request
-import urllib.parse
-import re
-import time, datetime
import os
+import re
import sys
+import time
+import urllib.parse
+import urllib.request
-API_ROOT = 'http://data.niassembly.gov.uk/hansard_json.ashx?m=GetAllHansardReports'
-API_PLENARY = 'http://data.niassembly.gov.uk/hansard_json.ashx?m=GetHansardComponentsByPlenaryDate&plenaryDate='
+API_ROOT = "http://data.niassembly.gov.uk/hansard_json.ashx?m=GetAllHansardReports"
+API_PLENARY = "http://data.niassembly.gov.uk/hansard_json.ashx?m=GetHansardComponentsByPlenaryDate&plenaryDate="
root = []
-#for i in range(1997,2003):
+# for i in range(1997,2003):
# root.append('http://www.niassembly.gov.uk/record/hansard_session%d.htm' % i)
-for i in range(2005,2007):
- root.append('http://archive.niassembly.gov.uk/record/hansard_session%d_A.htm' % i)
-root.append('http://archive.niassembly.gov.uk/record/hansard_session%d_TA.htm' % i)
-for i in range(2006,2012):
- root.append('http://archive.niassembly.gov.uk/record/hansard_session%d.htm' % i)
-for i in range(11,15):
- root.append('http://www.niassembly.gov.uk/Assembly-Business/Official-Report/Reports-%d-%d/' % (i, i+1))
+for i in range(2005, 2007):
+ root.append("http://archive.niassembly.gov.uk/record/hansard_session%d_A.htm" % i)
+root.append("http://archive.niassembly.gov.uk/record/hansard_session%d_TA.htm" % i)
+for i in range(2006, 2012):
+ root.append("http://archive.niassembly.gov.uk/record/hansard_session%d.htm" % i)
+for i in range(11, 15):
+ root.append(
+ "http://www.niassembly.gov.uk/Assembly-Business/Official-Report/Reports-%d-%d/"
+ % (i, i + 1)
+ )
ni_dir = os.path.dirname(__file__)
+
def scrape_ni_day(url, filename, forcescrape):
- filename = '%s/../../../parldata/cmpages/ni/%s' % (ni_dir, filename)
+ filename = "%s/../../../parldata/cmpages/ni/%s" % (ni_dir, filename)
data = urllib.request.urlopen(url).read()
- if b'ExceptionMessage' in data or b'"Message":"An error has occurred."' in data:
- print('ERROR received scraping %s' % url)
+ if b"ExceptionMessage" in data or b'"Message":"An error has occurred."' in data:
+ print("ERROR received scraping %s" % url)
return
save = True
if os.path.isfile(filename):
- current = open(filename, 'rb').read()
+ current = open(filename, "rb").read()
if current == data and not forcescrape:
save = False
if save:
print("NI scraping %s" % url)
- open(filename, 'wb').write(data)
+ open(filename, "wb").write(data)
def scrape_ni(datefrom, dateto, forcescrape=False):
# Let's use the API for anything post 2014-11-01 for the moment
- date_switch = '2014-11-01'
+ date_switch = "2014-11-01"
if datefrom <= date_switch:
scrape_ni_html(datefrom, dateto, forcescrape)
if dateto >= date_switch:
@@ -59,15 +64,17 @@ def scrape_ni_json(datefrom, dateto, forcescrape):
ur = urllib.request.urlopen(API_ROOT)
index = json.load(ur)
- if 'ExceptionMessage' in index:
- print('ERROR received scraping NI root')
+ if "ExceptionMessage" in index:
+ print("ERROR received scraping NI root")
return
- for day in index['AllHansardComponentsList']['HansardComponent']:
- date = day['PlenaryDate'][:10]
- if date < datefrom or date > dateto: continue
- if date < '2014-11-01': continue
- filename = 'ni%s.json' % date
+ for day in index["AllHansardComponentsList"]["HansardComponent"]:
+ date = day["PlenaryDate"][:10]
+ if date < datefrom or date > dateto:
+ continue
+ if date < "2014-11-01":
+ continue
+ filename = "ni%s.json" % date
scrape_ni_day(API_PLENARY + str(date), filename, forcescrape)
@@ -78,29 +85,43 @@ def scrape_ni_html(datefrom, dateto, forcescrape):
ur.close()
# Manual fixes
- page = page.replace('990315', '990715').replace('000617', '000619').replace('060706', '060606')
- page = page.replace('060919', '060919p').replace('071101', '071001').replace('071102', '071002')
-
- match = re.findall('<a href="([^"]*?([a-z]*)(\d{6})([a-z]*)\.htm)">View (?:as|in) HTML</a> *', page)
+ page = (
+ page.replace("990315", "990715")
+ .replace("000617", "000619")
+ .replace("060706", "060606")
+ )
+ page = (
+ page.replace("060919", "060919p")
+ .replace("071101", "071001")
+ .replace("071102", "071002")
+ )
+
+ match = re.findall(
+ '<a href="([^"]*?([a-z]*)(\d{6})([a-z]*)\.htm)">View (?:as|in) HTML</a> *',
+ page,
+ )
for day in match:
date = time.strptime(day[2], "%y%m%d")
- date = '%d-%02d-%02d' % date[:3]
- if date < datefrom or date > dateto: continue
- filename = 'ni%s%s%s.html' % (date, day[1], day[3])
+ date = "%d-%02d-%02d" % date[:3]
+ if date < datefrom or date > dateto:
+ continue
+ filename = "ni%s%s%s.html" % (date, day[1], day[3])
scrape_ni_day(urllib.parse.urljoin(url, day[0]), filename, forcescrape)
- match = re.findall('<a href=.([^ ]*?(\d\d/[^/ ]*)?([^/ ]*)/?).>Read now', page)
+ match = re.findall(
+ "<a href=.([^ ]*?(\d\d/[^/ ]*)?([^/ ]*)/?).>Read now",
+ page,
+ )
for day in match:
# Normally 12-December-2011 but recently 23-January-2012-1030am---1100am and 1030-1100am--17-January-2012
# and Monday-16-April
formats = (
# Manual fix for 2013-02-18
- (r'(18-Febraury-2013)', '%d-%braury-%Y', day[2]),
-
- (r'(\d{1,2}-[a-zA-Z]*-\d\d\d\d)', "%d-%B-%Y", day[2]),
- (r'(\d{2}/[a-zA-Z]*-\d{1,2}-[a-zA-Z]*)', "%y/%A-%d-%B", day[1]),
- (r'(\d{2}/\d{1,2}-[a-zA-Z]*)', "%y/%d-%B", day[1]),
- )
+ (r"(18-Febraury-2013)", "%d-%braury-%Y", day[2]),
+ (r"(\d{1,2}-[a-zA-Z]*-\d\d\d\d)", "%d-%B-%Y", day[2]),
+ (r"(\d{2}/[a-zA-Z]*-\d{1,2}-[a-zA-Z]*)", "%y/%A-%d-%B", day[1]),
+ (r"(\d{2}/\d{1,2}-[a-zA-Z]*)", "%y/%d-%B", day[1]),
+ )
date = None
for date_re, date_format, day_part in formats:
@@ -114,12 +135,16 @@ def scrape_ni_html(datefrom, dateto, forcescrape):
if not date:
raise ValueError("%s is not in a recognized format" % day[1])
- if datetime.date(*date[:3]) == datetime.date.today(): continue
- if datetime.date(*date[:3]) < datetime.date(2011, 12, 12): continue
- date = '%d-%02d-%02d' % date[:3]
- if date < datefrom or date > dateto: continue
- filename = 'ni%s.html' % date
+ if datetime.date(*date[:3]) == datetime.date.today():
+ continue
+ if datetime.date(*date[:3]) < datetime.date(2011, 12, 12):
+ continue
+ date = "%d-%02d-%02d" % date[:3]
+ if date < datefrom or date > dateto:
+ continue
+ filename = "ni%s.html" % date
scrape_ni_day(urllib.parse.urljoin(url, day[0]), filename, forcescrape)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
scrape_ni(*sys.argv[1:])
diff --git a/pyscraper/ni/wikipedia-mla.py b/pyscraper/ni/wikipedia-mla.py
index cbb092e0..2fa2cc6c 100755
--- a/pyscraper/ni/wikipedia-mla.py
+++ b/pyscraper/ni/wikipedia-mla.py
@@ -8,21 +8,26 @@
# For details see the file LICENSE.html in the top level of the source.
import os
+import re
import sys
import urllib.parse
-import re
-file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')
+file_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..")
sys.path.insert(0, file_dir)
from ni.resolvenames import memberList
-wiki_index_url = "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly"
-wikimembers = {}
+wiki_index_url = (
+ "https://en.wikipedia.org/wiki/Members_of_the_4th_Northern_Ireland_Assembly"
+)
+wikimembers = {}
+
# Grab pages
def read(y):
- with open(file_dir + '/../rawdata/Members_of_the_NIA_%d' % y) as ur:
+ with open(file_dir + "/../rawdata/Members_of_the_NIA_%d" % y) as ur:
return ur.read()
+
+
content = read(2003) + read(2007) + read(2011) + read(2016) + read(2017) + read(2022)
matches = set()
@@ -36,33 +41,40 @@ def read(y):
matches.update(re.findall(matcher, content))
# 4-6th Assembly changes
-changes = re.findall('(?s)<h2> <span[^>]*>(?:<span[^>]*>)?MLAs by constituency.*?<h2><span[^>]*>(?:<span[^>]*>)?Changes(.*?)