-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake-json.py
More file actions
executable file
·104 lines (84 loc) · 3.03 KB
/
make-json.py
File metadata and controls
executable file
·104 lines (84 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python3
import json
import os
import re
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator
def get_mandatory(cfg, name):
    """Return the value stored under ``name`` in ``cfg``.

    ``cfg`` only needs a ``get`` method (e.g. a dict loaded from JSON).

    Raises:
        Exception: if the key is absent or its value is ``None``.
    """
    value = cfg.get(name)
    if value is not None:
        return value
    raise Exception("define %s in config.json" % name)
def ensure_dir(d):
    """Create directory ``d`` (including missing parents) if it does not exist.

    Calling it for an already existing directory is a no-op.

    Raises:
        FileExistsError: if ``d`` exists but is not a directory.
        OSError: for any other failure reported by ``os.makedirs``.
    """
    # exist_ok=True replaces the former exists()/makedirs() pair, which could
    # fail when the directory appeared between the check and the creation,
    # and which silently accepted a regular file sitting at the path.
    os.makedirs(d, exist_ok=True)
class Generator:
    """Split the table found in a PDF document into one JSON file per row.

    The PDF path is taken from the mandatory ``sourceFile`` config entry and
    the output directory from the optional ``targetDir`` entry (default
    ``"json"``).
    """

    def __init__(self, cfg):
        """Read the settings from ``cfg`` and create the target directory.

        Raises:
            Exception: if ``sourceFile`` is not defined in ``cfg``.
        """
        self.source_file = get_mandatory(cfg, "sourceFile")
        self.target_dir = cfg.get("targetDir", "json")
        ensure_dir(self.target_dir)
        # Matches any run of whitespace; used by respace().
        self.space_rx = re.compile(r"\s+")

    def run(self):
        """Process every page of the source PDF except the first one."""
        with open(self.source_file, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for index, page in enumerate(PDFPage.create_pages(document)):
                if index == 0:
                    # Skip the title page before interpreting it: its layout
                    # would be thrown away anyway.
                    continue
                interpreter.process_page(page)
                self.process(device.get_result())

    def process(self, layout):
        """Extract the table rows of one page layout and pass each to handle_row.

        Only the horizontal text boxes with the greatest number of lines on
        the page are kept (presumably one box per table column - verify
        against the source PDF). The kept boxes are transposed into rows.

        Raises:
            Exception: if the number of kept text boxes is not exactly 7.
        """
        height = 0
        columns = []  # line lists of the kept text boxes, each `height` long
        for obj in layout:
            if not isinstance(obj, LTTextBoxHorizontal):
                continue
            lines = obj.get_text().split("\n")
            line_count = len(lines)
            if line_count > height:
                # Taller box found: drop the shorter ones collected so far.
                height = line_count
                columns = []
            if line_count == height:
                columns.append(lines)
        if len(columns) != 7:
            raise Exception("table not found")
        for row in zip(*columns):  # transpose columns into rows
            self.handle_row(list(row))

    def handle_row(self, row):
        """Write one table row to ``<target_dir>/<id>.json``.

        ``row`` is indexed 0..6: id, street, zip, municipality, lau1,
        charging point count, since. A row whose first cell is not an integer
        is skipped without writing anything.

        Raises:
            ValueError: if the charging point count cell is not an integer.
        """
        try:
            row_id = int(row[0])
        except ValueError:
            return  # skip empty row (normally last one on page)
        # Build the record before touching the file system so that a malformed
        # cell cannot leave an empty or truncated JSON file behind.
        record = {
            "Id": str(row_id),
            "street": self.respace(row[1]),
            "zip": self.respace(row[2], ""),
            "municipality": self.respace(row[3]),
            "lau1": row[4].strip(),
            "charging_point_count": int(row[5]),
            "since": row[6].strip(),
        }
        target_path = os.path.join(self.target_dir, "%d.json" % row_id)
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding is pinned to UTF-8 instead of the platform default.
        with open(target_path, 'w', encoding="utf-8") as f:
            json.dump(record, f, ensure_ascii=False)

    def respace(self, item, repl=" "):
        """Strip ``item`` and replace every whitespace run in it with ``repl``."""
        return self.space_rx.sub(repl, item.strip())
def main():
    """Load ``config.json`` from the current directory and run the generator."""
    # JSON interchange text is UTF-8; do not depend on the locale's default
    # encoding, which would misread non-ASCII paths on some platforms.
    with open("config.json", encoding="utf-8") as cf:
        cfg = json.load(cf)
    Generator(cfg).run()


if __name__ == "__main__":
    main()