From d94d9471dc8052a85ff48d9f6d67bfb6407bc704 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Mon, 13 Jul 2020 11:47:23 +0300
Subject: [PATCH 1/3] Updating parser to regex

---
 api/__init__.py  |   7 +-
 parser/parser.py | 283 ++++++++++++++++++++++++++---------------------
 2 files changed, 162 insertions(+), 128 deletions(-)

diff --git a/api/__init__.py b/api/__init__.py
index 54ce9af..37ae0dd 100644
--- a/api/__init__.py
+++ b/api/__init__.py
@@ -5,6 +5,7 @@
 from api.config import app_config
 from parser.parser import parse
 from api.util import validate_url
+import json

 document_url = None
 callback_url = None
@@ -30,8 +31,10 @@ def run_parse():
     response = {
         'error': 'None',
         'data': data
-    }
-    requests.post(callback_url, headers=headers, data=response)
+    }
+
+    res = json.dumps(response)
+    requests.post(callback_url, headers=headers, data=res)


 def create_app(config_name):
diff --git a/parser/parser.py b/parser/parser.py
index d230418..f55fabc 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -2,136 +2,167 @@
 from pdfminer.high_level import extract_text
 from shutil import copyfileobj
 import tempfile
-
-keywords = ['REGION', 'COUNTY', 'TIME', 'DATE','AREA', ',' ]
-
-class County:
-    name = None
-    area = None
-    time = None
-    date = None
-    locations = []
-
-    def serialize(self):
-        if self.name == None:
-            return
-
-        return { 'name': self.name,
-                 'area': self.area,
-                 'time': self.time,
-                 'date': self.date,
-                 'locations': self.locations
-        }
-
-class Region:
-    region = None
-    counties = []
-
-    def serialize(self):
-        ser_counties = []
-        for sc in self.counties:
-            ser_counties.append(sc.serialize())
-
-        return {'region': self.region,
-                'counties': ser_counties
-        }
-
-def download_file(url):
+import re
+
+from re import search, sub, IGNORECASE
+
+
+# {
+# "region": {
+# "name": "Region name",
+# "counties": [
+# {
+# "name": "County Name",
+# "areas":[
+# {
+# "name": "Area name",
+# "details": {
+# "date": "Date",
+# "time": "Time",
+# "locations": ["location"]
+# }
+# }
+# ]
+# }
+# ]
+# }
+# }
+
+def get_text(url):
     r = requests.get(url, stream=True)
     temFile = tempfile.TemporaryFile()
     copyfileobj(r.raw, temFile)
-    return temFile
-
-def get_text(file_):
-    return extract_text(file_)
-
-def check_for_keyword(lines):
-    new_lines = []
-    for line in lines:
-        for k in keywords:
-            if k in line:
-                new_lines.append(line)
-                break
-    return new_lines
-
-def take_lines(contents):
-    lines = []
-    contents = contents.split('\n')
-    lappend = lines.append
-    for line in contents:
-        if len(line) < 3: continue
-        lappend(line)
-    return check_for_keyword(lines[1:])
-
-def parse_(lines):
-    hit_county, hit_region, i = 0, 0, 0
-    regions = []
-    region = Region()
-    county = County()
-    rounds = len(lines)
-    for line in lines:
-        i += 1
-        line = line.replace('\n', '').lstrip().rstrip()
-        if 'REGION' in line:
-            if hit_region == 0:
-                region.region = line
-                hit_region = 1
-            elif hit_region == 1:
-                # another region encountered store current
-                region.counties.append(county)
-                county = County()
-                regions.append(region)
-                region = Region()
-                region.region = line
-
-        elif 'COUNTY' in line:
-            if hit_county == 0:
-                county.name = line
-                hit_county = 1
-            else:
-                region.counties.append(county)
-                county = County()
-                county.name = line
-
-        elif 'DATE' in line and 'TIME' in line:
-            date_str = ''
-            for x in line:
-                if x == ' ':
-                    continue
-
-                if x == 'T':
-                    county.date = date_str.replace('\n', '')
-                    date_str = x
-                    continue
-
-                date_str += x
-            county.time = date_str
-            county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')
-
-        elif 'DATE' in line:
-            county.date = line[6:]
-
-        elif 'TIME' in line:
-            county.time = line[6:]
-            county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')
-
-        elif 'AREA' in line:
-            county.area = line[6:]
-
-        if i == rounds-1:
-            region.counties.append(county)
-            regions.append(region)
-
-
+    text = extract_text(temFile)
+    text = text.replace("\n", '.')
+    text = sub(r"[\s]{2,}", ' ', text)
+    return text
+
+def get_regions(text):
+    regions = dict()
+    regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
+    region_search = search(regex, text, IGNORECASE)
+    while region_search:
+        # Get the top regio
+        region = dict()
+        region["name"] = region_search.group(1).strip()
+        print("Region: " + region["name"])
+        region_key = '_'.join(region["name"].lower().split(' '))
+        region["counties"] = get_counties(region_search.group(2), regions, region_key)
+        regions[region_key] = region
+        # Remove the region
+        text = text.replace(region_search.group(1), '')
+        text = text.replace(region_search.group(2), '')
+
+        # Do the region search again
+        region_search = search(regex, text, IGNORECASE)
+
+    last_region_check = search(r"[.]([a-zA-Z\s]+?REGION)(.+?customers)", text, IGNORECASE)
+    if last_region_check:
+        # Get the last region
+        region = dict()
+        region["name"] = last_region_check.group(1).strip()
+        print("Region: " + region["name"])
+        region_key = '_'.join(region["name"].lower().split(' '))
+        region["counties"] = get_counties(last_region_check.group(2), regions, region_key)
+        regions[region_key] = region
     return regions

+def get_counties(text, regions, region_key):
+    counties = list()
+    regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)"
+    county_search = search(regex, text, IGNORECASE)
+    while county_search:
+        # Get the top county
+        county = dict()
+        county["name"] = county_search.group(1).strip()
+        print("County: " + county["name"])
+        county["areas"] = get_areas(county_search.group(2))
+
+        # Check if the region already exists
+        if region_key in regions.keys():
+            regions[region_key]["counties"].append(county)
+        else:
+            counties.append(county)
+
+        # Remove the county
+        text = text.replace(county_search.group(1), '')
+        text = text.replace(county_search.group(2), '')
+
+        # Do the county search again
+        county_search = search(regex, text, IGNORECASE)
+
+    last_county_check = search(r"[.]([a-zA-Z\s]+?COUNTY)(.+?)$", text, IGNORECASE)
+    if last_county_check:
+        # Get the last county
+        county = dict()
+        county["name"] = last_county_check.group(1).strip()
+        print("County: " + county["name"])
+        county["areas"] = get_areas(last_county_check.group(2))
+
+        # Check if the region already exists
+        if region_key in regions.keys():
+            regions[region_key]["counties"].append(county)
+        else:
+            counties.append(county)
+
+    return counties
+
+def get_areas(text):
+    areas = list()
+    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    area_search = search(regex, text, IGNORECASE)
+    while area_search:
+        # Get the top area
+        area = dict()
+        area["name"] = area_search.group(1)
+        print(area["name"])
+        area["details"] = get_details(area_search.group(2))
+        areas.append(area)
+
+        # Remove the area
+        text = text.replace(area_search.group(1), '')
+        text = text.replace(area_search.group(2), '')
+
+        # Do the county search again
+        area_search = search(regex, text, IGNORECASE)
+
+    last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE)
+    if last_area_check:
+        # Get the last area
+        area = dict()
+        area["name"] = last_area_check.group(1)
+        print(area["name"])
+        area["details"] = get_details(last_area_check.group(2))
+        areas.append(area)
+
+    print("\n")
+    return areas
+
+def get_details(text):
+    details = dict()
+    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
+    if date_search:
+        print(date_search.group(2).strip())
+        details["date"] = date_search.group(2).strip()
+        text = text.replace(date_search.group(1), '')
+        text = text.replace(date_search.group(2), '')
+
+    time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
+    if time_search:
+        print(time_search.group(2).strip())
+        details["time"] = time_search.group(2).strip()
+        text = text.replace(time_search.group(1), '')
+        text = text.replace(time_search.group(2), '')
+
+    details["locations"] = get_locations(text)
+    print(details["locations"])
+
+    print("\n")
+    return details
+
+def get_locations(text):
+    stripSpaces = lambda location : location.strip()
+    return list(map(stripSpaces, text.split(',')))

 def parse(url):
-    tempFile = download_file(url)
-    file_data = take_lines(get_text(tempFile))
-    all_data = parse_(file_data)
-    serialized_data = []
-    append = serialized_data.append
-    for r in all_data:
-        append(r.serialize())
-    return serialized_data
+    return get_regions(get_text(url))
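
For orientation, a minimal sketch of how the reworked pipeline in this first patch is driven end to end, mirroring run_parse() in api/__init__.py; the document URL, callback address and headers below are placeholders rather than values from the patch:

    import json
    import requests

    from parser.parser import parse

    document_url = "https://example.com/interruptions.pdf"  # placeholder
    callback_url = "https://example.com/notify"             # placeholder

    # parse() now returns a plain dict keyed by region, so it can be serialised
    # with json.dumps() before being POSTed, which is what the api/__init__.py
    # change does with `res`.
    data = parse(document_url)
    payload = json.dumps({'error': 'None', 'data': data})
    requests.post(callback_url, headers={'Content-Type': 'application/json'}, data=payload)
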
From e2728cd657cec6ccb343361ca161b1724416b828 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Tue, 14 Jul 2020 15:38:14 +0300
Subject: [PATCH 2/3] Remove leading and trailing dots

---
 parser/parser.py | 18 +++---------------
 parser/util.py   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 15 deletions(-)
 create mode 100644 parser/util.py

diff --git a/parser/parser.py b/parser/parser.py
index f55fabc..84e1e07 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -2,10 +2,8 @@
 from pdfminer.high_level import extract_text
 from shutil import copyfileobj
 import tempfile
-import re
-
 from re import search, sub, IGNORECASE
-
+from .util import rlstrip_dot, composite_function

 # {
 # "region": {
 # "name": "Region name",
 # "counties": [
@@ -37,6 +35,7 @@ def get_text(url):
     text = sub(r"[\s]{2,}", ' ', text)
     return text
+

 def get_regions(text):
     regions = dict()
     regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
     region_search = search(regex, text, IGNORECASE)
@@ -45,7 +44,6 @@ def get_regions(text):
         # Get the top regio
         region = dict()
         region["name"] = region_search.group(1).strip()
-        print("Region: " + region["name"])
         region_key = '_'.join(region["name"].lower().split(' '))
         region["counties"] = get_counties(region_search.group(2), regions, region_key)
         regions[region_key] = region
@@ -61,7 +59,6 @@ def get_regions(text):
         # Get the last region
         region = dict()
         region["name"] = last_region_check.group(1).strip()
-        print("Region: " + region["name"])
         region_key = '_'.join(region["name"].lower().split(' '))
         region["counties"] = get_counties(last_region_check.group(2), regions, region_key)
         regions[region_key] = region
@@ -75,7 +72,6 @@ def get_counties(text, regions, region_key):
         # Get the top county
         county = dict()
         county["name"] = county_search.group(1).strip()
-        print("County: " + county["name"])
         county["areas"] = get_areas(county_search.group(2))

         # Check if the region already exists
@@ -96,7 +92,6 @@ def get_counties(text, regions, region_key):
         # Get the last county
         county = dict()
         county["name"] = last_county_check.group(1).strip()
-        print("County: " + county["name"])
         county["areas"] = get_areas(last_county_check.group(2))

         # Check if the region already exists
@@ -115,7 +110,6 @@ def get_areas(text):
         # Get the top area
         area = dict()
         area["name"] = area_search.group(1)
-        print(area["name"])
         area["details"] = get_details(area_search.group(2))
         areas.append(area)

@@ -131,38 +125,32 @@ def get_areas(text):
         # Get the last area
         area = dict()
         area["name"] = last_area_check.group(1)
-        print(area["name"])
         area["details"] = get_details(last_area_check.group(2))
         areas.append(area)

-    print("\n")
     return areas

 def get_details(text):
     details = dict()
     date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
     if date_search:
-        print(date_search.group(2).strip())
         details["date"] = date_search.group(2).strip()
         text = text.replace(date_search.group(1), '')
         text = text.replace(date_search.group(2), '')

     time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
     if time_search:
-        print(time_search.group(2).strip())
         details["time"] = time_search.group(2).strip()
         text = text.replace(time_search.group(1), '')
         text = text.replace(time_search.group(2), '')

     details["locations"] = get_locations(text)
-    print(details["locations"])

-    print("\n")
     return details

 def get_locations(text):
     stripSpaces = lambda location : location.strip()
-    return list(map(stripSpaces, text.split(',')))
+    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))

 def parse(url):
     return get_regions(get_text(url))
diff --git a/parser/util.py b/parser/util.py
new file mode 100644
index 0000000..2dd96e5
--- /dev/null
+++ b/parser/util.py
@@ -0,0 +1,17 @@
+from functools import reduce
+from re import sub
+
+# strip leading and trailing dots
+def rlstrip_dot(string):
+    return sub(r"^[\.]|\.\s+$", "", string)
+
+
+# helper function
+
+# this function takes a number of functions and composes them
+def composite_function(*func):
+
+    def compose(f, g):
+        return lambda x : f(g(x))
+
+    return reduce(compose, func, lambda x : x)
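
A small usage sketch of the two new helpers, separate from the patch itself; the sample location strings are invented. composite_function applies its arguments right to left, so rlstrip_dot runs before the strip, exactly as get_locations() combines them:

    from parser.util import rlstrip_dot, composite_function

    # strip surrounding dots first, then surrounding whitespace
    clean = composite_function(lambda location: location.strip(), rlstrip_dot)

    print(clean(" Kileleshwa. "))            # -> "Kileleshwa"
    print(rlstrip_dot(".Lower Kabete. "))    # -> "Lower Kabete"
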
From 1d907a055d87a59fc073b486b48ac127db475a60 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Wed, 15 Jul 2020 17:01:09 +0300
Subject: [PATCH 3/3] Document the updated parsing functions

---
 parser/parser.py | 89 ++++++++++++++++++++++++++++++------------------
 parser/util.py   |  2 +-
 2 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/parser/parser.py b/parser/parser.py
index 84e1e07..3d7002e 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -27,6 +27,9 @@
 # }

 def get_text(url):
+    """
+    Downloads a PDF, converts it to text and returns the text with every '\n' replaced with '.'
+    """
     r = requests.get(url, stream=True)
     temFile = tempfile.TemporaryFile()
     copyfileobj(r.raw, temFile)
@@ -37,11 +40,15 @@ def get_text(url):
     text = sub(r"[\s]{2,}", ' ', text)
     return text

 def get_regions(text):
+    """ Consumes text.
+    Takes a chunk of text marked with REGION at the beginning and the next REGION at the end.
+    From that chunk of text, counties, areas and area details are mined.
+    """
     regions = dict()
     regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
     region_search = search(regex, text, IGNORECASE)
     while region_search:
-        # Get the top regio
+        # Get the top region
         region = dict()
         region["name"] = region_search.group(1).strip()
         region_key = '_'.join(region["name"].lower().split(' '))
@@ -65,6 +72,11 @@ def get_regions(text):
     return regions

 def get_counties(text, regions, region_key):
+    """Consumes text, a dictionary of regions and the current region key.
+    The text is chunked using county boundaries.
+    The parsed county and its details will be stored in the
+    dictionary value of the current region key.
+    """
     counties = list()
     regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)"
     county_search = search(regex, text, IGNORECASE)
@@ -103,54 +115,63 @@ def get_counties(text, regions, region_key):
     return counties

 def get_areas(text):
-    areas = list()
-    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    """Consumes text.
+    Chunks the text using AREA boundaries, capturing the date section.
+    The area details (time and date) are mined from the date capture group.
+    """
+    areas = list()
+    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    area_search = search(regex, text, IGNORECASE)
+    while area_search:
+        # Get the top area
+        area = dict()
+        area["name"] = area_search.group(1)
+        area["details"] = get_details(area_search.group(2))
+        areas.append(area)
+
+        # Remove the area
+        text = text.replace(area_search.group(1), '')
+        text = text.replace(area_search.group(2), '')
+
+        # Do the county search again
         area_search = search(regex, text, IGNORECASE)
-    while area_search:
-        # Get the top area
-        area = dict()
-        area["name"] = area_search.group(1)
-        area["details"] = get_details(area_search.group(2))
-        areas.append(area)
-
-        # Remove the area
-        text = text.replace(area_search.group(1), '')
-        text = text.replace(area_search.group(2), '')
-
-        # Do the county search again
-        area_search = search(regex, text, IGNORECASE)

     last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE)
     if last_area_check:
-        # Get the last area
-        area = dict()
-        area["name"] = last_area_check.group(1)
-        area["details"] = get_details(last_area_check.group(2))
-        areas.append(area)
+        # Get the last area
+        area = dict()
+        area["name"] = last_area_check.group(1)
+        area["details"] = get_details(last_area_check.group(2))
+        areas.append(area)

-    return areas
+    return areas

 def get_details(text):
-    details = dict()
-    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
-    if date_search:
-        details["date"] = date_search.group(2).strip()
-        text = text.replace(date_search.group(1), '')
-        text = text.replace(date_search.group(2), '')
+    """The text consumed should be from a date capture group.
+    The text is searched for time and date.
+    """
+    details = dict()
+    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
+    if date_search:
+        details["date"] = date_search.group(2).strip()
+        text = text.replace(date_search.group(1), '')
+        text = text.replace(date_search.group(2), '')

     time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
     if time_search:
-        details["time"] = time_search.group(2).strip()
-        text = text.replace(time_search.group(1), '')
-        text = text.replace(time_search.group(2), '')
+        details["time"] = time_search.group(2).strip()
+        text = text.replace(time_search.group(1), '')
+        text = text.replace(time_search.group(2), '')

     details["locations"] = get_locations(text)

-    return details
+    return details

 def get_locations(text):
-    stripSpaces = lambda location : location.strip()
-    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))
+    """Mines comma-separated locations at the end of an area section"""
+    stripSpaces = lambda location : location.strip()
+    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))

 def parse(url):
+    """Downloads the document at url and parses it into the regions dictionary"""
     return get_regions(get_text(url))
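
As a rough, non-authoritative illustration of the chunking these new docstrings describe (the sample string below is invented, not taken from a real bulletin): the REGION pattern captures one region header plus everything up to the next region header, and get_regions() then deletes the match and searches again:

    from re import search, IGNORECASE

    sample = ".NAIROBI REGION.Kiambu County details here.COAST REGION.Kilifi County details.600 customers"
    m = search(r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)", sample, IGNORECASE)
    print(m.group(1))  # "NAIROBI REGION"
    print(m.group(2))  # ".Kiambu County details here" -- the block get_counties() chunks next
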
+ """ counties = list() regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)" county_search = search(regex, text, IGNORECASE) @@ -103,54 +115,63 @@ def get_counties(text, regions, region_key): return counties def get_areas(text): - areas = list() - regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA" + """consume text + chunks text using AREA boundaries and capturing date + The area details (time and date) are mined from the date capture group + """ + areas = list() + regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA" + area_search = search(regex, text, IGNORECASE) + while area_search: + # Get the top area + area = dict() + area["name"] = area_search.group(1) + area["details"] = get_details(area_search.group(2)) + areas.append(area) + + # Remove the area + text = text.replace(area_search.group(1), '') + text = text.replace(area_search.group(2), '') + + # Do the county search again area_search = search(regex, text, IGNORECASE) - while area_search: - # Get the top area - area = dict() - area["name"] = area_search.group(1) - area["details"] = get_details(area_search.group(2)) - areas.append(area) - - # Remove the area - text = text.replace(area_search.group(1), '') - text = text.replace(area_search.group(2), '') - - # Do the county search again - area_search = search(regex, text, IGNORECASE) last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE) if last_area_check: - # Get the last area - area = dict() - area["name"] = last_area_check.group(1) - area["details"] = get_details(last_area_check.group(2)) - areas.append(area) + # Get the last area + area = dict() + area["name"] = last_area_check.group(1) + area["details"] = get_details(last_area_check.group(2)) + areas.append(area) - return areas + return areas def get_details(text): - details = dict() - date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE) - if date_search: - details["date"] = date_search.group(2).strip() - text = text.replace(date_search.group(1), '') - text = text.replace(date_search.group(2), '') + """The text consumed should be from a date capture group + The text is searched for time and date + """ + details = dict() + date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE) + if date_search: + details["date"] = date_search.group(2).strip() + text = text.replace(date_search.group(1), '') + text = text.replace(date_search.group(2), '') time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE) if time_search: - details["time"] = time_search.group(2).strip() - text = text.replace(time_search.group(1), '') - text = text.replace(time_search.group(2), '') + details["time"] = time_search.group(2).strip() + text = text.replace(time_search.group(1), '') + text = text.replace(time_search.group(2), '') details["locations"] = get_locations(text) - return details + return details def get_locations(text): - stripSpaces = lambda location : location.strip() - return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(','))) + """Mines comma separated locations at the end of a area section""" + stripSpaces = lambda location : location.strip() + return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(','))) def parse(url): + """ Do everything""" return get_regions(get_text(url)) diff --git a/parser/util.py b/parser/util.py index 2dd96e5..0f75cce 100644 --- a/parser/util.py +++ b/parser/util.py @@ -3,7 +3,7 @@ # strip leading and trailing dots def rlstrip_dot(string): - return sub(r"^[\.]|\.\s+$", "", string) + return sub(r"^[\.]+|[\.\s]+$", "", string) # helper function