14
14
import csv
15
15
import os
16
16
17
- import sas7bdat
18
17
import pickle
18
+ import sas7bdat
19
19
import epiweeks as epi
20
20
21
21
22
22
# Root directory where raw AFHSB data lives and where processed output is written.
DATAPATH = '/home/automation/afhsb_data'
SOURCE_DIR = DATAPATH  # directory containing the raw SAS (.sas7bdat) input files
TARGET_DIR = DATAPATH  # directory where pickled / CSV results are written

# DMIS IDs considered invalid; starts empty and is presumably populated
# elsewhere during processing — TODO confirm against the full file.
INVALID_DMISIDS = set()
27
27
28
28
def get_flu_cat(dx):
    """Classify a diagnosis code string into a flu category.

    Returns 1 (flu1/influenza), 2 (flu2), or 3 (flu3) when *dx* starts
    with a recognized ICD-9 (all-digit) or ICD-10 (letter + digits)
    flu-related prefix; returns None for an empty string, a code of
    unrecognized shape, or a code matching no known prefix.
    """
    if not dx:
        return None
    dx = dx.capitalize()

    if dx.isnumeric():
        # ICD-9 style codes: purely numeric. Groups are checked in order.
        icd9_groups = (
            (("487", "488"), 1),                              # flu1 (influenza)
            (tuple(str(480 + i) for i in range(7)), 2),       # flu2: 480-486
            (tuple(str(460 + i) for i in range(7)), 3),       # flu3: 460-466
            (("07999", "3829", "7806", "7862"), 3),           # flu3: misc codes
        )
        for prefixes, category in icd9_groups:
            if dx.startswith(prefixes):
                return category
    elif dx[0].isalpha() and dx[1:].isnumeric():
        # ICD-10 style codes: a single letter followed by digits.
        icd10_groups = (
            (("J09", "J10", "J11"), 1),                       # flu1 (influenza)
            (tuple("J{}".format(i) for i in range(12, 19)), 2),  # flu2: J12-J18
            (tuple("J0{}".format(i) for i in range(7)), 3),      # flu3: J00-J06
            (tuple("J{}".format(i) for i in range(20, 23)), 3),  # flu3: J20-J22
            (("J40", "R05", "H669", "R509", "B9789"), 3),        # flu3: misc codes
        )
        for prefixes, category in icd10_groups:
            if dx.startswith(prefixes):
                return category
    else:
        return None
59
69
60
70
def aggregate_data (sourcefile , targetfile ):
61
71
reader = sas7bdat .SAS7BDAT (os .path .join (SOURCE_DIR , sourcefile ), skip_header = True )
62
72
# map column names to column indices
63
- COL2IDX = {column .name .decode ('utf-8' ): column .col_id for column in reader .columns }
64
- def get_field (row , column ): return row [COL2IDX [column ]]
73
+ col_2_idx = {column .name .decode ('utf-8' ): column .col_id for column in reader .columns }
74
+
75
+ def get_field (row , column ):
76
+ return row [col_2_idx [column ]]
65
77
66
78
def row2flu (row ):
67
79
for i in range (1 , 9 ):
68
80
dx = get_field (row , "dx{}" .format (i ))
69
81
flu_cat = get_flu_cat (dx )
70
- if (flu_cat != None ): return flu_cat
82
+ if flu_cat is not None :
83
+ return flu_cat
71
84
return 0
72
85
73
86
def row2epiweek (row ):
@@ -77,28 +90,30 @@ def row2epiweek(row):
77
90
year , week_num = week_tuple [0 ], week_tuple [1 ]
78
91
return year , week_num
79
92
80
- results_dict = dict ()
81
- for r , row in enumerate (reader ):
93
+ results_dict = {}
94
+ for _ , row in enumerate (reader ):
82
95
# if (r >= 1000000): break
83
- if (get_field (row , 'type' ) != "Outpt" ): continue
96
+ if get_field (row , 'type' ) != "Outpt" :
97
+ continue
84
98
year , week_num = row2epiweek (row )
85
99
dmisid = get_field (row , 'DMISID' )
86
100
flu_cat = row2flu (row )
87
101
88
102
key_list = [year , week_num , dmisid , flu_cat ]
89
103
curr_dict = results_dict
90
104
for i , key in enumerate (key_list ):
91
- if (i == len (key_list ) - 1 ):
92
- if (not key in curr_dict ): curr_dict [key ] = 0
105
+ if i == len (key_list ) - 1 :
106
+ if key not in curr_dict :
107
+ curr_dict [key ] = 0
93
108
curr_dict [key ] += 1
94
109
else :
95
- if (not key in curr_dict ): curr_dict [key ] = dict ()
110
+ if key not in curr_dict :
111
+ curr_dict [key ] = {}
96
112
curr_dict = curr_dict [key ]
97
113
98
114
results_path = os .path .join (TARGET_DIR , targetfile )
99
115
with open (results_path , 'wb' ) as f :
100
116
pickle .dump (results_dict , f , pickle .HIGHEST_PROTOCOL )
101
- return
102
117
103
118
104
119
################# Functions for geographical information ####################
@@ -122,7 +137,7 @@ def format_dmisid_csv(filename, target_name):
122
137
123
138
src_csv = open (src_path , "r" , encoding = 'utf-8-sig' )
124
139
reader = csv .DictReader (src_csv )
125
-
140
+
126
141
dst_csv = open (dst_path , "w" )
127
142
fieldnames = ['dmisid' , 'country' , 'state' , 'zip5' ]
128
143
writer = csv .DictWriter (dst_csv , fieldnames = fieldnames )
@@ -132,9 +147,11 @@ def format_dmisid_csv(filename, target_name):
132
147
133
148
for row in reader :
134
149
country2 = row ['Facility ISO Country Code' ]
135
- if (country2 == "" ): country3 = ""
136
- elif (not country2 in country_mapping ):
137
- for key in row .keys (): print (key , row [key ])
150
+ if country2 == "" :
151
+ country3 = ""
152
+ elif country2 not in country_mapping :
153
+ for key in row .keys ():
154
+ print (key , row [key ])
138
155
continue
139
156
else :
140
157
country3 = country_mapping [country2 ]
@@ -149,6 +166,7 @@ def dmisid():
149
166
target_name = "simple_DMISID_FY2018.csv"
150
167
format_dmisid_csv (filename , target_name )
151
168
169
+
152
170
cen2states = {'cen1' : {'CT' , 'ME' , 'MA' , 'NH' , 'RI' , 'VT' },
153
171
'cen2' : {'NJ' , 'NY' , 'PA' },
154
172
'cen3' : {'IL' , 'IN' , 'MI' , 'OH' , 'WI' },
@@ -175,7 +193,7 @@ def state2region(D):
175
193
for region in D .keys ():
176
194
states = D [region ]
177
195
for state in states :
178
- assert ( not state in results )
196
+ assert state not in results
179
197
results [state ] = region
180
198
return results
181
199
@@ -204,7 +222,7 @@ def write_afhsb_csv(period):
204
222
with open (os .path .join (TARGET_DIR , "{}.csv" .format (period )), 'w' ) as csvfile :
205
223
writer = csv .DictWriter (csvfile , fieldnames = fieldnames )
206
224
writer .writeheader ()
207
-
225
+
208
226
i = 0
209
227
for year in sorted (results_dict .keys ()):
210
228
year_dict = results_dict [year ]
@@ -217,11 +235,12 @@ def write_afhsb_csv(period):
217
235
i += 1
218
236
epiweek = int ("{}{:02d}" .format (year , week ))
219
237
flu_type = flu_mapping [flu ]
220
-
238
+
221
239
row = {"epiweek" : epiweek , "dmisid" : None if (not dmisid .isnumeric ()) else dmisid ,
222
240
"flu_type" : flu_type , "visit_sum" : visit_sum , "id" : i }
223
241
writer .writerow (row )
224
- if (i % 100000 == 0 ): print (row )
242
+ if i % 100000 == 0 :
243
+ print (row )
225
244
226
245
def dmisid_start_time_from_file (filename ):
227
246
starttime_record = dict ()
@@ -230,7 +249,7 @@ def dmisid_start_time_from_file(filename):
230
249
for row in reader :
231
250
dmisid = row ['dmisid' ]
232
251
epiweek = int (row ['epiweek' ])
233
- if ( not dmisid in starttime_record ):
252
+ if dmisid not in starttime_record :
234
253
starttime_record [dmisid ] = epiweek
235
254
else :
236
255
starttime_record [dmisid ] = min (epiweek , starttime_record [dmisid ])
@@ -241,7 +260,7 @@ def dmisid_start_time():
241
260
record2 = dmisid_start_time_from_file (os .path .join (TARGET_DIR , "13to17.csv" ))
242
261
record = record1
243
262
for dmisid , epiweek in record2 .items ():
244
- if ( dmisid in record ) :
263
+ if dmisid in record :
245
264
record [dmisid ] = min (record [dmisid ], epiweek )
246
265
else :
247
266
record [dmisid ] = epiweek
@@ -261,10 +280,10 @@ def fillin_zero_to_csv(period, dmisid_start_record):
261
280
dmisid = row ['dmisid' ]
262
281
flu_type = row ['flu_type' ]
263
282
visit_sum = row ['visit_sum' ]
264
- if ( not epiweek in results_dict ) :
283
+ if epiweek not in results_dict :
265
284
results_dict [epiweek ] = dict ()
266
285
week_dict = results_dict [epiweek ]
267
- if ( not dmisid in week_dict ) :
286
+ if dmisid not in week_dict :
268
287
week_dict [dmisid ] = dict ()
269
288
dmisid_dict = week_dict [dmisid ]
270
289
dmisid_dict [flu_type ] = visit_sum
@@ -277,14 +296,15 @@ def fillin_zero_to_csv(period, dmisid_start_record):
277
296
week_dict = results_dict [epiweek ]
278
297
for dmisid in dmisid_group :
279
298
start_week = dmisid_start_record [dmisid ]
280
- if (start_week > epiweek ): continue
299
+ if start_week > epiweek :
300
+ continue
281
301
282
- if ( not dmisid in week_dict ) :
302
+ if dmisid not in week_dict :
283
303
week_dict [dmisid ] = dict ()
284
304
285
305
dmisid_dict = week_dict [dmisid ]
286
306
for flutype in flutype_group :
287
- if ( not flutype in dmisid_dict ) :
307
+ if flutype not in dmisid_dict :
288
308
dmisid_dict [flutype ] = 0
289
309
290
310
# Write to csv files
@@ -301,7 +321,7 @@ def fillin_zero_to_csv(period, dmisid_start_record):
301
321
row = {"id" : i , "epiweek" : epiweek , "dmisid" : dmisid ,
302
322
"flu_type" : flutype , "visit_sum" : visit_sum }
303
323
writer .writerow (row )
304
- if ( i % 100000 == 0 ) :
324
+ if i % 100000 == 0 :
305
325
print (row )
306
326
i += 1
307
327
print ("Wrote {} rows" .format (i ))
@@ -328,4 +348,4 @@ def main():
328
348
329
349
330
350
# Script entry point: run the AFHSB data preparation pipeline when
# executed directly (not on import).
if __name__ == '__main__':
    main()
0 commit comments