From d94d9471dc8052a85ff48d9f6d67bfb6407bc704 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Mon, 13 Jul 2020 11:47:23 +0300
Subject: [PATCH 1/3] Updating parser to regex

---
 api/__init__.py  |   7 +-
 parser/parser.py | 283 ++++++++++++++++++++++++++---------------------
 2 files changed, 162 insertions(+), 128 deletions(-)

diff --git a/api/__init__.py b/api/__init__.py
index 54ce9af..37ae0dd 100644
--- a/api/__init__.py
+++ b/api/__init__.py
@@ -5,6 +5,7 @@
 from api.config import app_config
 from parser.parser import parse
 from api.util import validate_url
+import json

 document_url = None
 callback_url = None
@@ -30,8 +31,10 @@ def run_parse():
     response = {
         'error': 'None',
         'data': data
-    }
-    requests.post(callback_url, headers=headers, data=response)
+    }
+
+    res = json.dumps(response)
+    requests.post(callback_url, headers=headers, data=res)


 def create_app(config_name):
diff --git a/parser/parser.py b/parser/parser.py
index d230418..f55fabc 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -2,136 +2,167 @@
 from pdfminer.high_level import extract_text
 from shutil import copyfileobj
 import tempfile
-
-keywords = ['REGION', 'COUNTY', 'TIME', 'DATE','AREA', ',' ]
-
-class County:
-    name = None
-    area = None
-    time = None
-    date = None
-    locations = []
-
-    def serialize(self):
-        if self.name == None:
-            return
-
-        return { 'name': self.name,
-                 'area': self.area,
-                 'time': self.time,
-                 'date': self.date,
-                 'locations': self.locations
-        }
-
-class Region:
-    region = None
-    counties = []
-
-    def serialize(self):
-        ser_counties = []
-        for sc in self.counties:
-            ser_counties.append(sc.serialize())
-
-        return {'region': self.region,
-                'counties': ser_counties
-        }
-
-def download_file(url):
+import re
+
+from re import search, sub, IGNORECASE
+
+
+# {
+# "region": {
+# "name": "Region name",
+# "counties": [
+# {
+# "name": "County Name",
+# "areas":[
+# {
+# "name": "Area name",
+# "details": {
+# "date": "Date",
+# "time": "Time",
+# "locations": ["location"]
+# }
+# }
+# ]
+# }
+# ]
+# }
+# }
+
+def get_text(url):
     r = requests.get(url, stream=True)
     temFile = tempfile.TemporaryFile()
     copyfileobj(r.raw, temFile)
-    return temFile
-
-def get_text(file_):
-    return extract_text(file_)
-
-def check_for_keyword(lines):
-    new_lines = []
-    for line in lines:
-        for k in keywords:
-            if k in line:
-                new_lines.append(line)
-                break
-    return new_lines
-
-def take_lines(contents):
-    lines = []
-    contents = contents.split('\n')
-    lappend = lines.append
-    for line in contents:
-        if len(line) < 3: continue
-        lappend(line)
-    return check_for_keyword(lines[1:])
-
-def parse_(lines):
-    hit_county, hit_region, i = 0, 0, 0
-    regions = []
-    region = Region()
-    county = County()
-    rounds = len(lines)
-    for line in lines:
-        i += 1
-        line = line.replace('\n', '').lstrip().rstrip()
-        if 'REGION' in line:
-            if hit_region == 0:
-                region.region = line
-                hit_region = 1
-            elif hit_region == 1:
-                # another region encountered store current
-                region.counties.append(county)
-                county = County()
-                regions.append(region)
-                region = Region()
-                region.region = line
-
-        elif 'COUNTY' in line:
-            if hit_county == 0:
-                county.name = line
-                hit_county = 1
-            else:
-                region.counties.append(county)
-                county = County()
-                county.name = line
-
-        elif 'DATE' in line and 'TIME' in line:
-            date_str = ''
-            for x in line:
-                if x == ' ':
-                    continue
-
-                if x == 'T':
-                    county.date = date_str.replace('\n', '')
-                    date_str = x
-                    continue
-
-                date_str += x
-            county.time = date_str
-            county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')
-
-        elif 'DATE' in line:
-            county.date = line[6:]
-
-        elif 'TIME' in line:
-            county.time = line[6:]
-            county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')
-
-        elif 'AREA' in line:
-            county.area = line[6:]
-
-        if i == rounds-1:
-            region.counties.append(county)
-            regions.append(region)
-
-
+    text = extract_text(temFile)
+    text = text.replace("\n", '.')
+    text = sub(r"[\s]{2,}", ' ', text)
+    return text
+
+def get_regions(text):
+    regions = dict()
+    regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
+    region_search = search(regex, text, IGNORECASE)
+    while region_search:
+        # Get the top regio
+        region = dict()
+        region["name"] = region_search.group(1).strip()
+        print("Region: " + region["name"])
+        region_key = '_'.join(region["name"].lower().split(' '))
+        region["counties"] = get_counties(region_search.group(2), regions, region_key)
+        regions[region_key] = region
+        # Remove the region
+        text = text.replace(region_search.group(1), '')
+        text = text.replace(region_search.group(2), '')
+
+        # Do the region search again
+        region_search = search(regex, text, IGNORECASE)
+
+    last_region_check = search(r"[.]([a-zA-Z\s]+?REGION)(.+?customers)", text, IGNORECASE)
+    if last_region_check:
+        # Get the last region
+        region = dict()
+        region["name"] = last_region_check.group(1).strip()
+        print("Region: " + region["name"])
+        region_key = '_'.join(region["name"].lower().split(' '))
+        region["counties"] = get_counties(last_region_check.group(2), regions, region_key)
+        regions[region_key] = region
     return regions

+def get_counties(text, regions, region_key):
+    counties = list()
+    regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)"
+    county_search = search(regex, text, IGNORECASE)
+    while county_search:
+        # Get the top county
+        county = dict()
+        county["name"] = county_search.group(1).strip()
+        print("County: " + county["name"])
+        county["areas"] = get_areas(county_search.group(2))
+
+        # Check if the region already exists
+        if region_key in regions.keys():
+            regions[region_key]["counties"].append(county)
+        else:
+            counties.append(county)
+
+        # Remove the county
+        text = text.replace(county_search.group(1), '')
+        text = text.replace(county_search.group(2), '')
+
+        # Do the county search again
+        county_search = search(regex, text, IGNORECASE)
+
+    last_county_check = search(r"[.]([a-zA-Z\s]+?COUNTY)(.+?)$", text, IGNORECASE)
+    if last_county_check:
+        # Get the last county
+        county = dict()
+        county["name"] = last_county_check.group(1).strip()
+        print("County: " + county["name"])
+        county["areas"] = get_areas(last_county_check.group(2))
+
+        # Check if the region already exists
+        if region_key in regions.keys():
+            regions[region_key]["counties"].append(county)
+        else:
+            counties.append(county)
+
+    return counties
+
+def get_areas(text):
+    areas = list()
+    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    area_search = search(regex, text, IGNORECASE)
+    while area_search:
+        # Get the top area
+        area = dict()
+        area["name"] = area_search.group(1)
+        print(area["name"])
+        area["details"] = get_details(area_search.group(2))
+        areas.append(area)
+
+        # Remove the area
+        text = text.replace(area_search.group(1), '')
+        text = text.replace(area_search.group(2), '')
+
+        # Do the county search again
+        area_search = search(regex, text, IGNORECASE)
+
+    last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE)
+    if last_area_check:
+        # Get the last area
+        area = dict()
+        area["name"] = last_area_check.group(1)
+        print(area["name"])
+        area["details"] = get_details(last_area_check.group(2))
+        areas.append(area)
+
+    print("\n")
+    return areas
+
+def get_details(text):
+    details = dict()
+    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
+    if date_search:
+        print(date_search.group(2).strip())
+        details["date"] = date_search.group(2).strip()
+        text = text.replace(date_search.group(1), '')
+        text = text.replace(date_search.group(2), '')
+
+    time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
+    if time_search:
+        print(time_search.group(2).strip())
+        details["time"] = time_search.group(2).strip()
+        text = text.replace(time_search.group(1), '')
+        text = text.replace(time_search.group(2), '')
+
+    details["locations"] = get_locations(text)
+    print(details["locations"])
+
+    print("\n")
+    return details
+
+def get_locations(text):
+    stripSpaces = lambda location : location.strip()
+    return list(map(stripSpaces, text.split(',')))

 def parse(url):
-    tempFile = download_file(url)
-    file_data = take_lines(get_text(tempFile))
-    all_data = parse_(file_data)
-    serialized_data = []
-    append = serialized_data.append
-    for r in all_data:
-        append(r.serialize())
-    return serialized_data
+    return get_regions(get_text(url))
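
For orientation, a minimal sketch of how the reworked pipeline in this first patch is driven end to end, mirroring run_parse() in api/__init__.py; the document URL, callback address and headers below are placeholders rather than values from the patch:

    import json
    import requests

    from parser.parser import parse

    document_url = "https://example.com/interruptions.pdf"  # placeholder
    callback_url = "https://example.com/notify"             # placeholder

    # parse() now returns a plain dict keyed by region, so it can be serialised
    # with json.dumps() before being POSTed, which is what the api/__init__.py
    # change does with `res`.
    data = parse(document_url)
    payload = json.dumps({'error': 'None', 'data': data})
    requests.post(callback_url, headers={'Content-Type': 'application/json'}, data=payload)
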
From e2728cd657cec6ccb343361ca161b1724416b828 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Tue, 14 Jul 2020 15:38:14 +0300
Subject: [PATCH 2/3] Remove leading and trailing dots

---
 parser/parser.py | 18 +++---------------
 parser/util.py   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 15 deletions(-)
 create mode 100644 parser/util.py

diff --git a/parser/parser.py b/parser/parser.py
index f55fabc..84e1e07 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -2,10 +2,8 @@
 from pdfminer.high_level import extract_text
 from shutil import copyfileobj
 import tempfile
-import re
-
 from re import search, sub, IGNORECASE
-
+from .util import rlstrip_dot, composite_function

 # {
 # "region": {
 # "name": "Region name",
 # "counties": [
@@ -37,6 +35,7 @@ def get_text(url):
     text = sub(r"[\s]{2,}", ' ', text)
     return text
+

 def get_regions(text):
     regions = dict()
     regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
     region_search = search(regex, text, IGNORECASE)
@@ -45,7 +44,6 @@ def get_regions(text):
         # Get the top regio
         region = dict()
         region["name"] = region_search.group(1).strip()
-        print("Region: " + region["name"])
         region_key = '_'.join(region["name"].lower().split(' '))
         region["counties"] = get_counties(region_search.group(2), regions, region_key)
         regions[region_key] = region
@@ -61,7 +59,6 @@ def get_regions(text):
         # Get the last region
         region = dict()
         region["name"] = last_region_check.group(1).strip()
-        print("Region: " + region["name"])
         region_key = '_'.join(region["name"].lower().split(' '))
         region["counties"] = get_counties(last_region_check.group(2), regions, region_key)
         regions[region_key] = region
@@ -75,7 +72,6 @@ def get_counties(text, regions, region_key):
         # Get the top county
         county = dict()
         county["name"] = county_search.group(1).strip()
-        print("County: " + county["name"])
         county["areas"] = get_areas(county_search.group(2))

         # Check if the region already exists
@@ -96,7 +92,6 @@ def get_counties(text, regions, region_key):
         # Get the last county
         county = dict()
         county["name"] = last_county_check.group(1).strip()
-        print("County: " + county["name"])
         county["areas"] = get_areas(last_county_check.group(2))

         # Check if the region already exists
@@ -115,7 +110,6 @@ def get_areas(text):
         # Get the top area
         area = dict()
         area["name"] = area_search.group(1)
-        print(area["name"])
         area["details"] = get_details(area_search.group(2))
         areas.append(area)

@@ -131,38 +125,32 @@ def get_areas(text):
         # Get the last area
         area = dict()
         area["name"] = last_area_check.group(1)
-        print(area["name"])
         area["details"] = get_details(last_area_check.group(2))
         areas.append(area)

-    print("\n")
     return areas

 def get_details(text):
     details = dict()
     date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
     if date_search:
-        print(date_search.group(2).strip())
         details["date"] = date_search.group(2).strip()
         text = text.replace(date_search.group(1), '')
         text = text.replace(date_search.group(2), '')

     time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
     if time_search:
-        print(time_search.group(2).strip())
         details["time"] = time_search.group(2).strip()
         text = text.replace(time_search.group(1), '')
         text = text.replace(time_search.group(2), '')

     details["locations"] = get_locations(text)
-    print(details["locations"])

-    print("\n")
     return details

 def get_locations(text):
     stripSpaces = lambda location : location.strip()
-    return list(map(stripSpaces, text.split(',')))
+    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))

 def parse(url):
     return get_regions(get_text(url))
diff --git a/parser/util.py b/parser/util.py
new file mode 100644
index 0000000..2dd96e5
--- /dev/null
+++ b/parser/util.py
@@ -0,0 +1,17 @@
+from functools import reduce
+from re import sub
+
+# strip leading and trailing dots
+def rlstrip_dot(string):
+    return sub(r"^[\.]|\.\s+$", "", string)
+
+
+# helper function
+
+# this function takes a number of functions and composes them
+def composite_function(*func):
+
+    def compose(f, g):
+        return lambda x : f(g(x))
+
+    return reduce(compose, func, lambda x : x)
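
A small usage sketch of the two new helpers, separate from the patch itself; the sample location strings are invented. composite_function applies its arguments right to left, so rlstrip_dot runs before the strip, exactly as get_locations() combines them:

    from parser.util import rlstrip_dot, composite_function

    # strip surrounding dots first, then surrounding whitespace
    clean = composite_function(lambda location: location.strip(), rlstrip_dot)

    print(clean(" Kileleshwa. "))            # -> "Kileleshwa"
    print(rlstrip_dot(".Lower Kabete. "))    # -> "Lower Kabete"
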
From 1d907a055d87a59fc073b486b48ac127db475a60 Mon Sep 17 00:00:00 2001
From: Pancakem
Date: Wed, 15 Jul 2020 17:01:09 +0300
Subject: [PATCH 3/3] Document the updated parsing functions

---
 parser/parser.py | 89 ++++++++++++++++++++++++++++++------------------
 parser/util.py   |  2 +-
 2 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/parser/parser.py b/parser/parser.py
index 84e1e07..3d7002e 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -27,6 +27,9 @@
 # }

 def get_text(url):
+    """
+    Downloads a PDF, converts it to text and returns the text with every '\n' replaced with '.'
+    """
     r = requests.get(url, stream=True)
     temFile = tempfile.TemporaryFile()
     copyfileobj(r.raw, temFile)
@@ -37,11 +40,15 @@ def get_text(url):
     text = sub(r"[\s]{2,}", ' ', text)
     return text

 def get_regions(text):
+    """ Consumes text.
+    Takes a chunk of text marked with REGION at the beginning and the next REGION at the end.
+    From that chunk of text, counties, areas and area details are mined.
+    """
     regions = dict()
     regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
     region_search = search(regex, text, IGNORECASE)
     while region_search:
-        # Get the top regio
+        # Get the top region
         region = dict()
         region["name"] = region_search.group(1).strip()
         region_key = '_'.join(region["name"].lower().split(' '))
@@ -65,6 +72,11 @@ def get_regions(text):
     return regions

 def get_counties(text, regions, region_key):
+    """Consumes text, a dictionary of regions and the current region key.
+    The text is chunked using county boundaries.
+    The parsed county and its details will be stored in the
+    dictionary value of the current region key.
+    """
     counties = list()
     regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)"
     county_search = search(regex, text, IGNORECASE)
@@ -103,54 +115,63 @@ def get_counties(text, regions, region_key):
     return counties

 def get_areas(text):
-    areas = list()
-    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    """Consumes text.
+    Chunks the text using AREA boundaries, capturing the date section.
+    The area details (time and date) are mined from the date capture group.
+    """
+    areas = list()
+    regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
+    area_search = search(regex, text, IGNORECASE)
+    while area_search:
+        # Get the top area
+        area = dict()
+        area["name"] = area_search.group(1)
+        area["details"] = get_details(area_search.group(2))
+        areas.append(area)
+
+        # Remove the area
+        text = text.replace(area_search.group(1), '')
+        text = text.replace(area_search.group(2), '')
+
+        # Do the county search again
         area_search = search(regex, text, IGNORECASE)
-    while area_search:
-        # Get the top area
-        area = dict()
-        area["name"] = area_search.group(1)
-        area["details"] = get_details(area_search.group(2))
-        areas.append(area)
-
-        # Remove the area
-        text = text.replace(area_search.group(1), '')
-        text = text.replace(area_search.group(2), '')
-
-        # Do the county search again
-        area_search = search(regex, text, IGNORECASE)

     last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE)
     if last_area_check:
-        # Get the last area
-        area = dict()
-        area["name"] = last_area_check.group(1)
-        area["details"] = get_details(last_area_check.group(2))
-        areas.append(area)
+        # Get the last area
+        area = dict()
+        area["name"] = last_area_check.group(1)
+        area["details"] = get_details(last_area_check.group(2))
+        areas.append(area)

-    return areas
+    return areas

 def get_details(text):
-    details = dict()
-    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
-    if date_search:
-        details["date"] = date_search.group(2).strip()
-        text = text.replace(date_search.group(1), '')
-        text = text.replace(date_search.group(2), '')
+    """The text consumed should be from a date capture group.
+    The text is searched for time and date.
+    """
+    details = dict()
+    date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
+    if date_search:
+        details["date"] = date_search.group(2).strip()
+        text = text.replace(date_search.group(1), '')
+        text = text.replace(date_search.group(2), '')

     time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
     if time_search:
-        details["time"] = time_search.group(2).strip()
-        text = text.replace(time_search.group(1), '')
-        text = text.replace(time_search.group(2), '')
+        details["time"] = time_search.group(2).strip()
+        text = text.replace(time_search.group(1), '')
+        text = text.replace(time_search.group(2), '')

     details["locations"] = get_locations(text)

-    return details
+    return details

 def get_locations(text):
-    stripSpaces = lambda location : location.strip()
-    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))
+    """Mines comma-separated locations at the end of an area section"""
+    stripSpaces = lambda location : location.strip()
+    return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))

 def parse(url):
+    """Downloads the document at url and parses it into the regions dictionary"""
     return get_regions(get_text(url))
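
As a rough, non-authoritative illustration of the chunking these new docstrings describe (the sample string below is invented, not taken from a real bulletin): the REGION pattern captures one region header plus everything up to the next region header, and get_regions() then deletes the match and searches again:

    from re import search, IGNORECASE

    sample = ".NAIROBI REGION.Kiambu County details here.COAST REGION.Kilifi County details.600 customers"
    m = search(r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)", sample, IGNORECASE)
    print(m.group(1))  # "NAIROBI REGION"
    print(m.group(2))  # ".Kiambu County details here" -- the block get_counties() chunks next
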
+ """ counties = list() regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)" county_search = search(regex, text, IGNORECASE) @@ -103,54 +115,63 @@ def get_counties(text, regions, region_key): return counties def get_areas(text): - areas = list() - regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA" + """consume text + chunks text using AREA boundaries and capturing date + The area details (time and date) are mined from the date capture group + """ + areas = list() + regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA" + area_search = search(regex, text, IGNORECASE) + while area_search: + # Get the top area + area = dict() + area["name"] = area_search.group(1) + area["details"] = get_details(area_search.group(2)) + areas.append(area) + + # Remove the area + text = text.replace(area_search.group(1), '') + text = text.replace(area_search.group(2), '') + + # Do the county search again area_search = search(regex, text, IGNORECASE) - while area_search: - # Get the top area - area = dict() - area["name"] = area_search.group(1) - area["details"] = get_details(area_search.group(2)) - areas.append(area) - - # Remove the area - text = text.replace(area_search.group(1), '') - text = text.replace(area_search.group(2), '') - - # Do the county search again - area_search = search(regex, text, IGNORECASE) last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE) if last_area_check: - # Get the last area - area = dict() - area["name"] = last_area_check.group(1) - area["details"] = get_details(last_area_check.group(2)) - areas.append(area) + # Get the last area + area = dict() + area["name"] = last_area_check.group(1) + area["details"] = get_details(last_area_check.group(2)) + areas.append(area) - return areas + return areas def get_details(text): - details = dict() - date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE) - if date_search: - details["date"] = date_search.group(2).strip() - text = text.replace(date_search.group(1), '') - text = text.replace(date_search.group(2), '') + """The text consumed should be from a date capture group + The text is searched for time and date + """ + details = dict() + date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE) + if date_search: + details["date"] = date_search.group(2).strip() + text = text.replace(date_search.group(1), '') + text = text.replace(date_search.group(2), '') time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE) if time_search: - details["time"] = time_search.group(2).strip() - text = text.replace(time_search.group(1), '') - text = text.replace(time_search.group(2), '') + details["time"] = time_search.group(2).strip() + text = text.replace(time_search.group(1), '') + text = text.replace(time_search.group(2), '') details["locations"] = get_locations(text) - return details + return details def get_locations(text): - stripSpaces = lambda location : location.strip() - return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(','))) + """Mines comma separated locations at the end of a area section""" + stripSpaces = lambda location : location.strip() + return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(','))) def parse(url): + """ Do everything""" return get_regions(get_text(url)) diff --git a/parser/util.py b/parser/util.py index 2dd96e5..0f75cce 100644 --- a/parser/util.py +++ b/parser/util.py @@ -3,7 +3,7 @@ # strip leading and trailing dots def rlstrip_dot(string): - return sub(r"^[\.]|\.\s+$", "", string) + return sub(r"^[\.]+|[\.\s]+$", "", string) # helper function