 import pandas as pd

+
 class CovidHospException(Exception):
   """Exception raised exclusively by `covid_hosp` utilities."""

@@ -69,7 +70,26 @@ def parse_bool(value):
       return False
     raise CovidHospException(f'cannot convert "{value}" to bool')

-  def issues_to_fetch(metadata, newer_than, older_than):
+  def limited_string_fn(length):
+    def limited_string(value):
+      value = str(value)
+      if len(value) > length:
+        raise CovidHospException(f"Value '{value}':{len(value)} longer than max {length}")
+      return value
+    return limited_string
+
+  GEOCODE_LENGTH = 32
+  GEOCODE_PATTERN = re.compile(r'POINT \((-?[0-9.]+) (-?[0-9.]+)\)')
+  def limited_geocode(value):
+    if len(value) < Utils.GEOCODE_LENGTH:
+      return value
+    # otherwise parse and set precision to 6 decimal places
+    m = Utils.GEOCODE_PATTERN.match(value)
+    if not m:
+      raise CovidHospException(f"Couldn't parse geocode '{value}'")
+    return f'POINT ({" ".join(f"{float(x):.6f}" for x in m.groups())})'
+
+  def issues_to_fetch(metadata, newer_than, older_than, logger=False):
     """
     Construct all issue dates and URLs to be ingested based on metadata.

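The two helpers added above act as column sanitizers: `limited_string_fn(n)` builds a validator that rejects any string longer than `n` characters, and `limited_geocode` rewrites long `POINT (lon lat)` strings at six-decimal precision while passing short ones through. A minimal usage sketch (the import path is an assumption, not part of this diff):

```python
# Illustrative only; the import path below is assumed, not taken from the diff.
from delphi.epidata.acquisition.covid_hosp.common.utils import CovidHospException, Utils

validate = Utils.limited_string_fn(32)   # returns a callable enforcing a 32-character limit
validate("Pennsylvania")                 # -> "Pennsylvania"
try:
  validate("x" * 33)                     # 33 characters -> rejected
except CovidHospException as e:
  print(e)                               # exception about exceeding the 32-character limit

# Geocodes shorter than 32 characters pass through; longer ones are parsed and
# re-emitted with 6 decimal places of precision.
print(Utils.limited_geocode("POINT (-77.0146425698 38.7095210344)"))
# POINT (-77.014643 38.709521)
```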
@@ -81,13 +101,16 @@ def issues_to_fetch(metadata, newer_than, older_than):
       Lower bound (exclusive) of days to get issues for.
     older_than Date
       Upper bound (exclusive) of days to get issues for
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages
     Returns
     -------
     Dictionary of {issue day: list of (download urls, index)}
       for issues after newer_than and before older_than
     """
     daily_issues = {}
     n_beyond = 0
+    n_selected = 0
     for index in sorted(set(metadata.index)):
       day = index.date()
       if day > newer_than and day < older_than:
@@ -97,14 +120,17 @@ def issues_to_fetch(metadata, newer_than, older_than):
           daily_issues[day] = urls_list
         else:
           daily_issues[day] += urls_list
+        n_selected += len(urls_list)
       elif day >= older_than:
         n_beyond += 1
-    if n_beyond > 0:
-      print(f"{n_beyond} issues available on {older_than} or newer")
+    if logger:
+      if n_beyond > 0:
+        logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond)
+      logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected)
     return daily_issues

   @staticmethod
-  def merge_by_key_cols(dfs, key_cols):
+  def merge_by_key_cols(dfs, key_cols, logger=False):
     """Merge a list of data frames as a series of updates.

     Parameters:
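The `logger` parameter is documented as a `structlog.Logger`, and the new `logger.info(...)` calls above use structlog's event-plus-keywords style instead of formatted strings. A minimal sketch of that style (the configuration and output shown are illustrative):

```python
# Requires the structlog package; with its default configuration, bound keyword
# arguments are rendered alongside the event name as key=value pairs.
import structlog

logger = structlog.get_logger()
logger.info("issues selected", newer_than="2021-03-13", older_than="2021-03-17", count=4)
# e.g. ... [info] issues selected  count=4 newer_than=2021-03-13 older_than=2021-03-17
```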
@@ -113,13 +139,20 @@ def merge_by_key_cols(dfs, key_cols):
       Data frames to merge, ordered from earliest to latest.
     key_cols: list(str)
       Columns to use as the index.
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages

     Returns a single data frame containing the most recent data for each state+date.
     """

     dfs = [df.set_index(key_cols) for df in dfs
            if not all(k in df.index.names for k in key_cols)]
     result = dfs[0]
+    if logger and len(dfs) > 7:
+      logger.warning(
+        "expensive operation",
+        msg="concatenating more than 7 files may result in long running times",
+        count=len(dfs))
     for df in dfs[1:]:
       # update values for existing keys
       result.update(df)
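`merge_by_key_cols` treats later frames as revisions of earlier ones, so for any key that appears more than once the most recent value wins. A hedged sketch of the `DataFrame.update` step visible in this hunk (the column names are invented stand-ins for `db.KEY_COLS`; handling of keys that appear only in later frames falls outside the hunk):

```python
import pandas as pd

key_cols = ["state", "date"]   # invented stand-in for db.KEY_COLS
early = pd.DataFrame({"state": ["PA", "NY"], "date": [20201201, 20201201], "beds": [100, 200]})
late = pd.DataFrame({"state": ["PA"], "date": [20201201], "beds": [150]})

result = early.set_index(key_cols)
result.update(late.set_index(key_cols))   # overwrite values for keys present in both frames
print(result)
# after the update, PA/20201201 holds 150 (the later revision) and NY/20201201 keeps 200
```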
@@ -153,22 +186,25 @@ def update_dataset(database, network, newer_than=None, older_than=None):
     bool
       Whether a new dataset was acquired.
     """
-    metadata = network.fetch_metadata()
+    logger = database.logger()
+
+    metadata = network.fetch_metadata(logger=logger)
     datasets = []
     with database.connect() as db:
-      max_issue = db.get_max_issue()
+      max_issue = db.get_max_issue(logger=logger)

     older_than = datetime.datetime.today().date() if newer_than is None else older_than
     newer_than = max_issue if newer_than is None else newer_than
-    daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than)
+    daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
-      print("no new issues, nothing to do")
+      logger.info("no new issues; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
       # download the dataset and add it to the database
-      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
-                                        db.KEY_COLS)
+      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions],
+                                        db.KEY_COLS,
+                                        logger=logger)
       # add metadata to the database
       all_metadata = []
       for url, index in revisions:
@@ -180,10 +216,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
       ))
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
-        db.insert_dataset(issue_int, dataset)
+        db.insert_dataset(issue_int, dataset, logger=logger)
         for url, metadata_json in all_metadata:
-          db.insert_metadata(issue_int, url, metadata_json)
-        print(f'successfully acquired {len(dataset)} rows')
+          db.insert_metadata(issue_int, url, metadata_json, logger=logger)
+        logger.info("acquired rows", count=len(dataset))

     # note that the transaction is committed by exiting the `with` block
     return True
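Taken together, `update_dataset` now obtains its logger from `database.logger()` and threads it through fetching, merging, and insertion instead of printing. The contract implied by the diff is small: anything passed as `logger` only needs `info()` and `warning()` accepting an event name plus keyword context. A hedged stand-in, e.g. for tests (the real code uses structlog):

```python
# Minimal stand-in satisfying the logger calls made in this diff; illustrative only.
class PrintLogger:
  def info(self, event, **kwargs):
    print("INFO", event, kwargs)

  def warning(self, event, **kwargs):
    print("WARNING", event, kwargs)

logger = PrintLogger()
logger.info("acquired rows", count=42)
# INFO acquired rows {'count': 42}
```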