-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathimport_csv.py
59 lines (46 loc) · 1.67 KB
/
import_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import csv
import glob
import json
import gzip
from marisa_trie import Trie
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CSV_FOLDER = 'CSV'
OUTPUT_GZ = 'postcodes.gz'
def code_from_csv():
"""
This function searches all CSV files from the
specified "CSV_FOLDER" and extracts only the first column;
the first column should be the UK post-code.
"""
globpath = '{}/{}/*.csv'.format(BASE_DIR, CSV_FOLDER)
for csvname in glob.glob(globpath):
print('Loading CSV ::', csvname)
with open(csvname, newline='') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in csvreader:
code = row[0].upper().replace(' ', '')
yield code
def write_to_file():
"""
All the extracted post-codes are normalized and exported in a file.
"""
# Sets are the most efficient data structure for storing unique strings
postcodes = {}
for code in code_from_csv():
out_code = code[:-3]
inw_code = code[-3:]
if out_code not in postcodes:
postcodes[out_code] = set()
postcodes[out_code].add(inw_code)
# Calculate length before joining the set
codes = sum(len(val) for val in postcodes.values())
for key, val in postcodes.items():
postcodes[key] = ','.join(sorted(val))
out_path = '{}/{}'.format(BASE_DIR, OUTPUT_GZ)
with gzip.open(out_path, 'wt') as output:
json.dump(postcodes, output, indent=2, sort_keys=True)
f_size = os.stat(out_path).st_size / 1000 / 1000
print('Stat :: {:,} codes, {:,.2f} MB file.'.format(codes, f_size))
if __name__ == '__main__':
write_to_file()