 
 import pandas as pd
 
+
 class CovidHospException(Exception):
   """Exception raised exclusively by `covid_hosp` utilities."""
 
@@ -69,7 +70,26 @@ def parse_bool(value):
       return False
     raise CovidHospException(f'cannot convert "{value}" to bool')
 
-  def issues_to_fetch(metadata, newer_than, older_than):
+  def limited_string_fn(length):
+    def limited_string(value):
+      value = str(value)
+      if len(value) > length:
+        raise CovidHospException(f"Value '{value}':{len(value)} longer than max {length}")
+      return value
+    return limited_string
+
+  GEOCODE_LENGTH = 32
+  GEOCODE_PATTERN = re.compile(r'POINT \((-?[0-9.]+) (-?[0-9.]+)\)')
+  def limited_geocode(value):
+    if len(value) < Utils.GEOCODE_LENGTH:
+      return value
+    # otherwise parse and set precision to 6 decimal places
+    m = Utils.GEOCODE_PATTERN.match(value)
+    if not m:
+      raise CovidHospException(f"Couldn't parse geocode '{value}'")
+    return f'POINT ({" ".join(f"{float(x):.6f}" for x in m.groups())})'
+
+  def issues_to_fetch(metadata, newer_than, older_than, logger=False):
     """
     Construct all issue dates and URLs to be ingested based on metadata.
 
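(Aside: a rough sketch of how the new validators behave. Only `limited_string_fn`, `limited_geocode`, and `CovidHospException` come from the diff above; the sample values and call sites are invented for illustration.)

    # assumes Utils and CovidHospException are imported from the module patched above
    max20 = Utils.limited_string_fn(20)
    max20("SOUTH DAKOTA")           # 12 chars <= 20, returned unchanged
    # max20("x" * 21)               # would raise CovidHospException

    Utils.limited_geocode("POINT (-91.875 42.46)")
    # shorter than GEOCODE_LENGTH (32), returned as-is
    Utils.limited_geocode("POINT (-91.87500123456789 42.45998765432101)")
    # parsed and rounded to 6 decimal places: 'POINT (-91.875001 42.459988)'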
@@ -81,13 +101,16 @@ def issues_to_fetch(metadata, newer_than, older_than):
       Lower bound (exclusive) of days to get issues for.
     older_than Date
       Upper bound (exclusive) of days to get issues for
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages
     Returns
     -------
       Dictionary of {issue day: list of (download urls, index)}
       for issues after newer_than and before older_than
     """
     daily_issues = {}
     n_beyond = 0
+    n_selected = 0
     for index in sorted(set(metadata.index)):
       day = index.date()
       if day > newer_than and day < older_than:
@@ -97,14 +120,17 @@ def issues_to_fetch(metadata, newer_than, older_than):
           daily_issues[day] = urls_list
         else:
           daily_issues[day] += urls_list
+        n_selected += len(urls_list)
       elif day >= older_than:
         n_beyond += 1
-    if n_beyond > 0:
-      print(f"{n_beyond} issues available on {older_than} or newer")
+    if logger:
+      if n_beyond > 0:
+        logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond)
+      logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected)
     return daily_issues
 
   @staticmethod
-  def merge_by_key_cols(dfs, key_cols):
+  def merge_by_key_cols(dfs, key_cols, logger=False):
     """Merge a list of data frames as a series of updates.
 
     Parameters:
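(Aside: the new `logger` argument is structlog-style, i.e. an event name plus keyword bindings, and defaults to `False` so existing callers keep working. A minimal sketch of what the added call sites emit, assuming a default structlog setup:)

    import structlog

    logger = structlog.get_logger()
    # mirrors the call added in issues_to_fetch; exact output format depends on configuration
    logger.info("issues selected", newer_than="2021-01-01", older_than="2021-01-05", count=3)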
@@ -113,13 +139,20 @@ def merge_by_key_cols(dfs, key_cols):
         Data frames to merge, ordered from earliest to latest.
       key_cols: list(str)
         Columns to use as the index.
+      logger structlog.Logger [optional; default False]
+        Logger to receive messages
 
     Returns a single data frame containing the most recent data for each state+date.
     """
 
     dfs = [df.set_index(key_cols) for df in dfs
            if not all(k in df.index.names for k in key_cols)]
     result = dfs[0]
+    if logger and len(dfs) > 7:
+      logger.warning(
+        "expensive operation",
+        msg="concatenating more than 7 files may result in long running times",
+        count=len(dfs))
     for df in dfs[1:]:
       # update values for existing keys
       result.update(df)
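(Aside: a toy illustration of the update-by-key behaviour `merge_by_key_cols` relies on. The data frames are invented, and the stated result assumes the remainder of the loop appends new keys, as the docstring describes.)

    import pandas as pd

    early = pd.DataFrame({"state": ["PA", "WY"], "date": [20201201, 20201201], "beds": [10, 20]})
    late = pd.DataFrame({"state": ["PA", "TX"], "date": [20201201, 20201201], "beds": [11, 30]})

    merged = Utils.merge_by_key_cols([early, late], ["state", "date"])
    # later frames win: PA -> 11 (updated), WY -> 20 (kept), TX -> 30 (appended)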
@@ -153,22 +186,25 @@ def update_dataset(database, network, newer_than=None, older_than=None):
     bool
       Whether a new dataset was acquired.
     """
-    metadata = network.fetch_metadata()
+    logger = database.logger()
+
+    metadata = network.fetch_metadata(logger=logger)
     datasets = []
     with database.connect() as db:
-      max_issue = db.get_max_issue()
+      max_issue = db.get_max_issue(logger=logger)
 
     older_than = datetime.datetime.today().date() if newer_than is None else older_than
     newer_than = max_issue if newer_than is None else newer_than
-    daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than)
+    daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
-      print("no new issues, nothing to do")
+      logger.info("no new issues; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
       # download the dataset and add it to the database
-      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
-                                        db.KEY_COLS)
+      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions],
+                                        db.KEY_COLS,
+                                        logger=logger)
       # add metadata to the database
       all_metadata = []
       for url, index in revisions:
@@ -180,10 +216,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
       ))
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
-        db.insert_dataset(issue_int, dataset)
+        db.insert_dataset(issue_int, dataset, logger=logger)
         for url, metadata_json in all_metadata:
-          db.insert_metadata(issue_int, url, metadata_json)
-        print(f'successfully acquired {len(dataset)} rows')
+          db.insert_metadata(issue_int, url, metadata_json, logger=logger)
+        logger.info("acquired rows", count=len(dataset))
 
     # note that the transaction is committed by exiting the `with` block
     return True
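(Aside: the call site for `update_dataset` is unchanged by this diff; only the logging destination moves from `print` to the structlog logger returned by `database.logger()`. A hedged sketch of a driver, where `Database` and `Network` stand in for the dataset-specific classes the real update scripts pass in:)

    def run_update(Database, Network):
      # Database and Network are the per-dataset classes (hypothetical names here);
      # progress messages now flow through Database.logger() instead of print()
      return Utils.update_dataset(Database, Network)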