@@ -1,4 +1,4 @@
-'''
+"""
 ===============
 === Purpose ===
 ===============
@@ -15,7 +15,7 @@
   * add end date, end week check
 2017-12-02:
   * original version
-'''
+"""

 # standard library
 from collections import defaultdict
@@ -35,8 +35,9 @@
 import delphi.utils.epidate as ED
 from delphi.utils.geo.locations import Locations

-def word_map(row,terms):
-  for (k,v) in terms.items():
-    row = row.replace(k,v)
+
+def word_map(row, terms):
+    for (k, v) in terms.items():
+        row = row.replace(k, v)
     return row

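`word_map` is a pre-parse scrub: it rewrites known-bad substrings in a raw CSV line before the line is split on commas, so the column count stays stable. A minimal self-contained sketch using the one term hardcoded in `map_terms` below; the sample row itself is hypothetical:

```python
def word_map(row, terms):
    for (k, v) in terms.items():
        row = row.replace(k, v)
    return row

# the state field was mangled by an embedded quote; scrub it before splitting
map_terms = {' FL 34637"': "FL"}
line = 'S123,2017-11-30,F001,Tampa, FL 34637",34637,45,positive,negative,positive\n'
print(word_map(line, map_terms).strip().split(","))
# ['S123', '2017-11-30', 'F001', 'Tampa', 'FL', '34637', '45', 'positive', 'negative', 'positive']
```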
@@ -43,11 +44,12 @@
-def date_less_than(d1,d2):
-  y1,m1,d1 = [int(x) for x in d1.split('-')]
-  y2,m2,d2 = [int(x) for x in d2.split('-')]
-
-  if y1*10000+m1*100+d1 < y2*10000+m2*100+d2:
+
+def date_less_than(d1, d2):
+    y1, m1, d1 = (int(x) for x in d1.split("-"))
+    y2, m2, d2 = (int(x) for x in d2.split("-"))
+
+    if y1 * 10000 + m1 * 100 + d1 < y2 * 10000 + m2 * 100 + d2:
         return 1
-  elif y1*10000+m1*100+d1 == y2*10000+m2*100+d2:
+    elif y1 * 10000 + m1 * 100 + d1 == y2 * 10000 + m2 * 100 + d2:
         return 0
     else:
         return -1

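`date_less_than` works by collapsing each `yyyy-mm-dd` string into the integer `y*10000 + m*100 + d`, which orders exactly like the calendar; note the slightly inverted convention that it returns `1` when the first date is *earlier*. A quick self-contained check:

```python
def date_less_than(d1, d2):
    y1, m1, d1 = (int(x) for x in d1.split("-"))
    y2, m2, d2 = (int(x) for x in d2.split("-"))
    # 2017-12-02 -> 20171202: integer order matches calendar order
    if y1 * 10000 + m1 * 100 + d1 < y2 * 10000 + m2 * 100 + d2:
        return 1
    elif y1 * 10000 + m1 * 100 + d1 == y2 * 10000 + m2 * 100 + d2:
        return 0
    else:
        return -1

assert date_less_than("2017-12-02", "2018-01-01") == 1   # first date earlier
assert date_less_than("2018-01-01", "2018-01-01") == 0
assert date_less_than("2018-01-01", "2017-12-02") == -1  # first date later
```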
@@ -54,9 +56,10 @@
+
 # shift>0: shifted to future
 def date_to_epiweek(date, shift=0):
-  y,m, d = [int(x) for x in date.split('-')]
+    y, m, d = (int(x) for x in date.split("-"))

-  epidate = ED.EpiDate(y,m, d)
+    epidate = ED.EpiDate(y, m, d)
     epidate = epidate.add_days(shift)
     ew = epidate.get_ew()
     return ew

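`shift` moves the calendar date before it is converted to an epiweek, which is how the module decides whether a partial week counts at all (see `end_epiweek = date_to_epiweek(end_date, shift=-4)` further down: a week is included only once its Thursday is covered). A sketch assuming the `delphi.utils` package from delphi-epidata is importable; the printed values are what the MMWR calendar gives for this Saturday:

```python
import delphi.utils.epidate as ED

def date_to_epiweek(date, shift=0):
    y, m, d = (int(x) for x in date.split("-"))
    return ED.EpiDate(y, m, d).add_days(shift).get_ew()

# 2017-12-02 is a Saturday, the last day of MMWR week 201748
print(date_to_epiweek("2017-12-02"))            # 201748
print(date_to_epiweek("2017-12-02", shift=-4))  # 201748 (Tuesday of the same week)
print(date_to_epiweek("2017-12-03", shift=-4))  # 201748: week 49 not counted yet
```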
@@ -63,17 +66,18 @@
+
 # convert measurement to time series format
 # startweek and endweek are inclusive
-def measurement_to_ts(m,index,startweek=None,endweek=None):
+def measurement_to_ts(m, index, startweek=None, endweek=None):
     if startweek is None:
         startweek = 0
     if endweek is None:
         endweek = 999999
     res = {}
-  for r,rdict in m.items():
-    res[r]={}
-    for t,vals in rdict.items():
-      if index>=len(vals):
+    for r, rdict in m.items():
+        res[r] = {}
+        for t, vals in rdict.items():
+            if index >= len(vals):
                 raise Exception("Index is invalid")
-      if t>=startweek and t<=endweek:
+            if t >= startweek and t <= endweek:
                 res[r][t] = vals[index]
     return res

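`measurement_to_ts` slices one component out of the nested `region -> epiweek -> vector` structure that `prepare_measurements` returns, clipping to an inclusive `[startweek, endweek]` range. A self-contained sketch on hypothetical toy data:

```python
def measurement_to_ts(m, index, startweek=None, endweek=None):
    if startweek is None:
        startweek = 0
    if endweek is None:
        endweek = 999999
    res = {}
    for r, rdict in m.items():
        res[r] = {}
        for t, vals in rdict.items():
            if index >= len(vals):
                raise Exception("Index is invalid")
            if t >= startweek and t <= endweek:
                res[r][t] = vals[index]
    return res

# region -> epiweek -> measurement vector (toy values)
m = {"hhs2": {201747: [3.0, 1.0, 0.0, 4.0], 201748: [5.0, 2.0, 1.0, 8.0]}}
print(measurement_to_ts(m, index=3, startweek=201748))
# {'hhs2': {201748: 8.0}}
```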
@@ -80,29 +84,48 @@
+
 class QuidelData:
     def __init__(self, raw_path, load_email=True):
         self.data_path = raw_path
-    self.excel_uptodate_path = join(raw_path,'excel/uptodate')
-    self.excel_history_path = join(raw_path,'excel/history')
-    self.csv_path = join(raw_path,'csv')
+        self.excel_uptodate_path = join(raw_path, "excel/uptodate")
+        self.excel_history_path = join(raw_path, "excel/history")
+        self.csv_path = join(raw_path, "csv")
         self.xlsx_uptodate_list = [
-      f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:]=='.xlsx'
+            f[:-5]
+            for f in listdir(self.excel_uptodate_path)
+            if isfile(join(self.excel_uptodate_path, f)) and f[-5:] == ".xlsx"
         ]
         self.xlsx_history_list = [
-      f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:]=='.xlsx'
+            f[:-5]
+            for f in listdir(self.excel_history_path)
+            if isfile(join(self.excel_history_path, f)) and f[-5:] == ".xlsx"
+        ]
+        self.csv_list = [
+            f[:-4]
+            for f in listdir(self.csv_path)
+            if isfile(join(self.csv_path, f)) and f[-4:] == ".csv"
         ]
-    self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv']
         self.map_terms = {
-      ' FL 34637"':'FL',
+            ' FL 34637"': "FL",
         }
         # hardcoded parameters
         self.date_dim = 1
         self.state_dim = 4
         self.fields = [
-      'sofia_ser','date','fac_id','city','state','zip','age',
-      'fluA','fluB','fluAll','county','fac_type'
+            "sofia_ser",
+            "date",
+            "fac_id",
+            "city",
+            "state",
+            "zip",
+            "age",
+            "fluA",
+            "fluB",
+            "fluAll",
+            "county",
+            "fac_type",
         ]
-    self.fields_to_keep = ['fac_id', 'fluA', 'fluB', 'fluAll']
+        self.fields_to_keep = ["fac_id", "fluA", "fluB", "fluAll"]
         self.dims_to_keep = [self.fields.index(x) for x in self.fields_to_keep]
         if load_email:
             self.retrieve_excels()
             self.prepare_csv()

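A hypothetical construction sequence, inferred from this `__init__`: `raw_path` is expected to contain `excel/uptodate/`, `excel/history/`, and `csv/` subdirectories, and `load_email=False` skips the IMAP fetch so the class works purely from files already on disk. The path and call order below are illustrative, not prescribed by this diff:

```python
# raw_path/
#   excel/uptodate/*.xlsx   workbooks pulled from email
#   excel/history/*.xlsx    workbooks already processed
#   csv/*.csv               per-workbook CSV conversions
qd = QuidelData("/path/to/quidel/raw", load_email=False)
qd.prepare_csv()                # convert any new workbooks to CSV
data_dict = qd.load_csv()       # date -> state -> kept fields (inferred shape)
measurements = qd.prepare_measurements(data_dict, use_hhs=True)
```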
@@ -109,46 +132,56 @@
     def retrieve_excels(self):
-    detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current)
+        detach_dir = (
+            self.excel_uptodate_path
+        )  # directory where to save attachments (default: current)

         # connecting to the gmail imap server
         m = imaplib.IMAP4_SSL("imap.gmail.com")
-    m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd)
-    m.select("INBOX") # here you a can choose a mail box like INBOX instead
+        m.login(secrets.quidel.email_addr, secrets.quidel.email_pwd)
+        m.select("INBOX")  # here you can choose a mailbox like INBOX instead
         # use m.list() to get all the mailboxes
-    _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp)
-    items = items[0].split() # getting the mails id
+        # you could filter using the IMAP rules here (check https://www.example-code.com/csharp/imap-search-critera.asp)
+        _, items = m.search(None, "ALL")
+        items = items[0].split()  # getting the mail ids

         # The email ids are ordered from past to now
         for emailid in items:
-      _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc
-      email_body = data[0][1].decode('utf-8') # getting the mail content
-      mail = email.message_from_string(email_body) # parsing the mail content to get a mail object
-
-      #Check if any attachments at all
-      if mail.get_content_maintype() != 'multipart':
+            _, data = m.fetch(
+                emailid, "(RFC822)"
+            )  # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc.
+            email_body = data[0][1].decode("utf-8")  # getting the mail content
+            mail = email.message_from_string(
+                email_body
+            )  # parsing the mail content to get a mail object
+
+            # Check if any attachments at all
+            if mail.get_content_maintype() != "multipart":
                 continue

             # we use walk to create a generator so we can iterate on the parts and forget about the recursive headache
             for part in mail.walk():
                 # multipart are just containers, so we skip them
-        if part.get_content_maintype() == 'multipart':
+                if part.get_content_maintype() == "multipart":
                     continue

                 # is this part an attachment?
-        if part.get('Content-Disposition') is None:
+                if part.get("Content-Disposition") is None:
                     continue

                 filename = part.get_filename()
                 # check duplicates
-        if filename[-5:]!='.xlsx' or filename[:-5] in self.xlsx_uptodate_list + self.xlsx_history_list:
+                if (
+                    filename[-5:] != ".xlsx"
+                    or filename[:-5] in self.xlsx_uptodate_list + self.xlsx_history_list
+                ):
                     continue

                 self.xlsx_uptodate_list.append(filename[:-5])
                 att_path = os.path.join(detach_dir, filename)

-        #Check if its already there
-        if not os.path.isfile(att_path) :
+                # Check if it's already there
+                if not os.path.isfile(att_path):
                     # finally write the stuff
-          fp = open(att_path, 'wb')
+                    fp = open(att_path, "wb")
                     fp.write(part.get_payload(decode=True))
                     fp.close()

@@ -155,28 +188,34 @@
     def prepare_csv(self):
-    need_update=False
+        need_update = False
         for f in self.xlsx_uptodate_list:
             if f in self.csv_list:
                 continue
             else:
-        need_update=True
+                need_update = True

-      date_regex = '\d{2}-\d{2}-\d{4}'
-      date_items = re.findall(date_regex,f)
+            date_regex = r"\d{2}-\d{2}-\d{4}"
+            date_items = re.findall(date_regex, f)
             if date_items:
-        end_date = '-'.join(date_items[-1].split('-')[x] for x in [2,0, 1])
+                end_date = "-".join(date_items[-1].split("-")[x] for x in [2, 0, 1])
             else:
-        print("End date not found in file name:"+f)
+                print("End date not found in file name:" + f)
                 end_date = None

-      df_dict = pd.read_excel(join(self.excel_uptodate_path, f + '.xlsx'), sheet_name=None)
-      for (_,df) in df_dict.items():
-        df = df.dropna(axis=0, how='all')
-        df['TestDate'] = df['TestDate'].apply(lambda x: x.strftime('%Y-%m-%d'))
-        df_filtered = df[df['TestDate'] != '']
+            df_dict = pd.read_excel(join(self.excel_uptodate_path, f + ".xlsx"), sheet_name=None)
+            for (_, df) in df_dict.items():
+                df = df.dropna(axis=0, how="all")
+                df["TestDate"] = df["TestDate"].apply(lambda x: x.strftime("%Y-%m-%d"))
+                df_filtered = df[df["TestDate"] != ""]
                 if end_date is not None:
-          df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date,x['TestDate'])!=1, axis=1)]
-        df_filtered.to_csv(join(self.csv_path, f + '.csv'), index=False, encoding='utf-8')
-    self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv']
+                    df_filtered = df_filtered[
+                        df.apply(lambda x: date_less_than(end_date, x["TestDate"]) != 1, axis=1)
+                    ]
+                df_filtered.to_csv(join(self.csv_path, f + ".csv"), index=False, encoding="utf-8")
+        self.csv_list = [
+            f[:-4]
+            for f in listdir(self.csv_path)
+            if isfile(join(self.csv_path, f)) and f[-4:] == ".csv"
+        ]
         self.need_update = need_update

     def load_csv(self, dims=None):
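The workbook names carry `mm-dd-yyyy` date stamps; `date_items[-1]` takes the last stamp (the period's end date), and the `[2, 0, 1]` reorder converts it to `yyyy-mm-dd` so it can be fed to `date_less_than`. A self-contained sketch with a hypothetical file name:

```python
import re

f = "Quidel Flu Data 11-26-2017 12-02-2017"  # hypothetical workbook name
date_items = re.findall(r"\d{2}-\d{2}-\d{4}", f)
# last stamp is the end date; reorder mm-dd-yyyy -> yyyy-mm-dd
end_date = "-".join(date_items[-1].split("-")[x] for x in [2, 0, 1])
print(end_date)  # 2017-12-02
```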
@@ -186,12 +225,12 @@ def load_csv(self, dims=None):
         for f in self.csv_list:
             if f in self.xlsx_history_list:
                 continue
-      rf = open(join(self.csv_path,f+'.csv'))
+            rf = open(join(self.csv_path, f + ".csv"))

             lines = rf.readlines()
             for l in lines[1:]:
-        l = word_map(l,self.map_terms)
-        row = l.strip().split(',')
+                l = word_map(l, self.map_terms)
+                row = l.strip().split(",")
                 date = row[self.date_dim]
                 state = row[self.state_dim]
                 if state not in parsed_dict[date]:
@@ -202,42 +241,43 @@ def load_csv(self, dims=None):

     # hardcoded aggregation function
     # output per region/epiweek: [fluA, fluB, fluAll, total, then the same four divided by #unique_devices]
-  def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6):
+    def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6):
         buffer_dict = {}
         if use_hhs:
             region_list = Locations.hhs_list
         else:
             region_list = Locations.atom_list

         def get_hhs_region(atom):
-      for region in Locations.hhs_list:
-        if atom.lower() in Locations.hhs_map[region]:
-          return region
-      if atom.lower() == 'ny':
-        return 'hhs2'
-      return atom
+            for region in Locations.hhs_list:
+                if atom.lower() in Locations.hhs_map[region]:
+                    return region
+            if atom.lower() == "ny":
+                return "hhs2"
+            return atom

         day_shift = 6 - start_weekday
-    time_map = lambda x:date_to_epiweek(x,shift=day_shift)
-    region_map = lambda x:get_hhs_region(x) \
-      if use_hhs and x not in Locations.hhs_list else x # a bit hacky
+        time_map = lambda x: date_to_epiweek(x, shift=day_shift)
+        region_map = (
+            lambda x: get_hhs_region(x) if use_hhs and x not in Locations.hhs_list else x
+        )  # a bit hacky

         end_date = sorted(data_dict.keys())[-1]
         # count the latest week in only if Thurs data is included
-    end_epiweek = date_to_epiweek(end_date,shift=-4)
+        end_epiweek = date_to_epiweek(end_date, shift=-4)
         # first pass: prepare device_id set
         device_dict = {}
-    for (date,daily_dict) in data_dict.items():
+        for (date, daily_dict) in data_dict.items():
             if not date:
                 continue
             ew = time_map(date)
-      if ew == -1 or ew > end_epiweek:
+            if ew == -1 or ew > end_epiweek:
                 continue
             if ew not in device_dict:
-        device_dict[ew]={}
+                device_dict[ew] = {}
                 for r in region_list:
                     device_dict[ew][r] = set()
-      for (state,rec_list) in daily_dict.items():
+            for (state, rec_list) in daily_dict.items():
                 region = region_map(state)
                 # get rid of non-US regions
                 if region not in region_list:
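`region_map` collapses state-level atoms into HHS regions by scanning `Locations.hhs_map` (region -> member atoms), with `"ny"` special-cased because the map's New York entries are split more finely; anything unrecognized passes through and is dropped later by the `region not in region_list` check. A sketch with a mocked two-region map (the real `Locations` tables are larger, and the split entry shown is hypothetical):

```python
# mocked stand-in for delphi.utils.geo.locations.Locations
hhs_list = ["hhs1", "hhs2"]
hhs_map = {
    "hhs1": ["ct", "me", "ma", "nh", "ri", "vt"],
    "hhs2": ["nj", "ny_minus_jfk"],  # plain "ny" absent, hence the special case
}

def get_hhs_region(atom):
    for region in hhs_list:
        if atom.lower() in hhs_map[region]:
            return region
    if atom.lower() == "ny":
        return "hhs2"
    return atom

print(get_hhs_region("MA"))  # hhs1
print(get_hhs_region("NY"))  # hhs2, via the special case
print(get_hhs_region("XX"))  # XX, filtered out downstream
```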
@@ -247,38 +287,40 @@ def get_hhs_region(atom):
                     device_dict[ew][region].add(fac)

         # second pass: prepare all measurements
-    for (date,daily_dict) in data_dict.items():
+        for (date, daily_dict) in data_dict.items():
             ew = time_map(date)
-      if ew == -1 or ew > end_epiweek:
+            if ew == -1 or ew > end_epiweek:
                 continue
             if ew not in buffer_dict:
-        buffer_dict[ew]={}
+                buffer_dict[ew] = {}
                 for r in region_list:
-          buffer_dict[ew][r] = [0.0]*8
+                    buffer_dict[ew][r] = [0.0] * 8

-      for (state,rec_list) in daily_dict.items():
+            for (state, rec_list) in daily_dict.items():
                 region = region_map(state)
                 # get rid of non-US regions
                 if region not in region_list:
                     continue
                 for rec in rec_list:
                     fac_num = float(len(device_dict[ew][region]))
-          buffer_dict[ew][region]= np.add(
-            buffer_dict[ew][region],[
-              rec[1]=='positive',
-              rec[2]=='positive',
-              rec[3]=='positive',
+                    buffer_dict[ew][region] = np.add(
+                        buffer_dict[ew][region],
+                        [
+                            rec[1] == "positive",
+                            rec[2] == "positive",
+                            rec[3] == "positive",
                             1.0,
-              float(rec[1]=='positive')/fac_num,
-              float(rec[2]=='positive')/fac_num,
-              float(rec[3]=='positive')/fac_num,
-              1.0/fac_num,
-            ]).tolist()
+                            float(rec[1] == "positive") / fac_num,
+                            float(rec[2] == "positive") / fac_num,
+                            float(rec[3] == "positive") / fac_num,
+                            1.0 / fac_num,
+                        ],
+                    ).tolist()
         # switch two dims of dict
         result_dict = {}
         for r in region_list:
-      result_dict[r]={}
-      for (k,v) in buffer_dict.items():
-        result_dict[r][k]=v[r]
+            result_dict[r] = {}
+            for (k, v) in buffer_dict.items():
+                result_dict[r][k] = v[r]

         return result_dict
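Each `buffer_dict[ew][region]` entry is an 8-vector: raw counts of fluA, fluB, and any-flu positives plus total tests, followed by the same four quantities divided by `fac_num`, the number of unique devices reporting in that region-week. A self-contained sketch of one accumulation step; the records are hypothetical, and `rec[1:4]` follow the fluA/fluB/fluAll order kept by `fields_to_keep`:

```python
import numpy as np

buffer_vec = [0.0] * 8
fac_num = 2.0  # unique devices in this region-week
for rec in [["F001", "positive", "negative", "positive"],
            ["F002", "negative", "negative", "negative"]]:
    buffer_vec = np.add(
        buffer_vec,
        [
            rec[1] == "positive",                   # fluA count
            rec[2] == "positive",                   # fluB count
            rec[3] == "positive",                   # fluAll count
            1.0,                                    # total tests
            float(rec[1] == "positive") / fac_num,  # per-device rates
            float(rec[2] == "positive") / fac_num,
            float(rec[3] == "positive") / fac_num,
            1.0 / fac_num,
        ],
    ).tolist()
print(buffer_vec)  # [1.0, 0.0, 1.0, 2.0, 0.5, 0.0, 0.5, 1.0]
```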