Commit a55cc10

Convert covid_hosp to use structured logger
* also adds some previously-absent logging
1 parent 2919a58 commit a55cc10

4 files changed: +54 / -20 lines changed


src/acquisition/covid_hosp/common/database.py

Lines changed: 13 additions & 3 deletions
@@ -124,7 +124,7 @@ def contains_revision(self, revision):
      for (result,) in cursor:
        return bool(result)

-  def insert_metadata(self, publication_date, revision, meta_json):
+  def insert_metadata(self, publication_date, revision, meta_json, logger=False):
    """Add revision metadata to the database.

    Parameters
@@ -135,6 +135,8 @@ def insert_metadata(self, publication_date, revision, meta_json):
      Unique revision string.
    meta_json : str
      Metadata serialized as a JSON string.
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages
    """

    with self.new_cursor() as cursor:
@@ -152,7 +154,7 @@ def insert_metadata(self, publication_date, revision, meta_json):
          (%s, %s, %s, %s, %s, NOW())
      ''', (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json))

-  def insert_dataset(self, publication_date, dataframe):
+  def insert_dataset(self, publication_date, dataframe, logger=False):
    """Add a dataset to the database.

    Parameters
@@ -161,6 +163,8 @@ def insert_dataset(self, publication_date, dataframe):
      Date when the dataset was published in YYYYMMDD format.
    dataframe : pandas.DataFrame
      The dataset.
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages.
    """
    dataframe_columns_and_types = [
      x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns
@@ -181,6 +185,8 @@ def nan_safe_dtype(dtype, value):
    sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \
          f'VALUES ({value_placeholders})'
    id_and_publication_date = (0, publication_date)
+    if logger:
+      logger.info("updating values")
    with self.new_cursor() as cursor:
      for _, row in dataframe.iterrows():
        values = []
@@ -193,6 +199,8 @@ def nan_safe_dtype(dtype, value):

    # deal with non/seldomly updated columns used like a fk table (if this database needs it)
    if hasattr(self, 'AGGREGATE_KEY_COLS'):
+      if logger:
+        logger.info("updating keys")
      ak_cols = self.AGGREGATE_KEY_COLS

      # restrict data to just the key columns and remove duplicate rows
@@ -225,7 +233,7 @@ def nan_safe_dtype(dtype, value):
        cur.executemany(ak_insert_sql, ak_data)


-  def get_max_issue(self):
+  def get_max_issue(self, logger=False):
    """Fetch the most recent issue.

    This is used to bookend what updates we pull in from the HHS metadata.
@@ -242,4 +250,6 @@ def get_max_issue(self):
      for (result,) in cursor:
        if result is not None:
          return pd.Timestamp(str(result))
+    if logger:
+      logger.info("get_max_issue", msg="no matching results in meta table; returning 1900/1/1 epoch")
    return pd.Timestamp("1900/1/1")

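As a quick orientation to the new optional `logger` keyword on these database methods, here is a minimal calling sketch; `database` and `dataset` are hypothetical stand-ins (an already-constructed Database helper and a pandas DataFrame), not part of this commit.

from delphi.epidata.acquisition.common.logger import get_structured_logger

logger = get_structured_logger("covid_hosp.example")  # logger name is illustrative

with database.connect() as db:
  # emits the "updating values" event (and "updating keys" when the class defines AGGREGATE_KEY_COLS)
  db.insert_dataset(20210315, dataset, logger=logger)
  # logs only when the meta table has no matching rows, then falls back to the 1900/1/1 epoch
  max_issue = db.get_max_issue(logger=logger)

Omitting the argument keeps the old silent behavior, since every method defaults to logger=False.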
src/acquisition/covid_hosp/common/network.py

Lines changed: 6 additions & 5 deletions
@@ -6,7 +6,7 @@ class Network:
  METADATA_URL_TEMPLATE = \
    'https://healthdata.gov/api/views/%s/rows.csv'

-  def fetch_metadata_for_dataset(dataset_id):
+  def fetch_metadata_for_dataset(dataset_id, logger=False):
    """Download and return metadata.

    Parameters
@@ -20,14 +20,15 @@ def fetch_metadata_for_dataset(dataset_id):
      The metadata object.
    """
    url = Network.METADATA_URL_TEMPLATE % dataset_id
-    print(f'fetching metadata at {url}')
+    if logger:
+      logger.info('fetching metadata', url=url)
    df = Network.fetch_dataset(url)
    df["Update Date"] = pandas.to_datetime(df["Update Date"])
    df.sort_values("Update Date", inplace=True)
    df.set_index("Update Date", inplace=True)
    return df

-  def fetch_dataset(url, pandas_impl=pandas):
+  def fetch_dataset(url, pandas_impl=pandas, logger=False):
    """Download and return a dataset.

    Type inference is disabled in favor of explicit type casting at the
@@ -44,6 +45,6 @@ def fetch_dataset(url, pandas_impl=pandas):
    pandas.DataFrame
      The dataset.
    """
-
-    print(f'fetching dataset at {url}')
+    if logger:
+      logger.info('fetching dataset', url=url)
    return pandas_impl.read_csv(url, dtype=str)

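Both functions follow the same guarded pattern: `logger` defaults to False, so existing callers keep working unchanged, and the old print() calls become structured key-value events only when a logger is supplied. A rough usage sketch, assuming a structlog logger obtained via get_structured_logger; the dataset id is a placeholder, not a real healthdata.gov id:

# With a logger, a structured event is emitted, roughly:
#   event='fetching metadata' url='https://healthdata.gov/api/views/abcd-1234/rows.csv'
meta = Network.fetch_metadata_for_dataset("abcd-1234", logger=logger)

# Without one (default logger=False), the fetch happens silently; no print() output anymore.
meta = Network.fetch_metadata_for_dataset("abcd-1234")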
src/acquisition/covid_hosp/common/utils.py

Lines changed: 34 additions & 11 deletions
@@ -6,6 +6,8 @@

import pandas as pd

+from delphi.epidata.acquisition.common.logger import get_structured_logger
+
class CovidHospException(Exception):
  """Exception raised exclusively by `covid_hosp` utilities."""

@@ -69,7 +71,15 @@ def parse_bool(value):
      return False
    raise CovidHospException(f'cannot convert "{value}" to bool')

-  def issues_to_fetch(metadata, newer_than, older_than):
+  def limited_string_fn(length):
+    def limited_string(value):
+      value = str(value)
+      if len(value) > length:
+        raise CovidHospException(f"Value '{value}':{len(value)} longer than max {length}")
+      return value
+    return limited_string
+
+  def issues_to_fetch(metadata, newer_than, older_than, logger=False):
    """
    Construct all issue dates and URLs to be ingested based on metadata.

@@ -81,6 +91,8 @@ def issues_to_fetch(metadata, newer_than, older_than):
      Lower bound (exclusive) of days to get issues for.
    older_than Date
      Upper bound (exclusive) of days to get issues for
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages
    Returns
    -------
    Dictionary of {issue day: list of (download urls, index)}
@@ -100,11 +112,12 @@ def issues_to_fetch(metadata, newer_than, older_than):
      elif day >= older_than:
        n_beyond += 1
    if n_beyond > 0:
-      print(f"{n_beyond} issues available on {older_than} or newer")
+      if logger:
+        logger.info("issues available", on_or_newer=older_than, count=n_beyond)
    return daily_issues

  @staticmethod
-  def merge_by_key_cols(dfs, key_cols):
+  def merge_by_key_cols(dfs, key_cols, logger=False):
    """Merge a list of data frames as a series of updates.

    Parameters:
@@ -113,13 +126,20 @@ def merge_by_key_cols(dfs, key_cols):
      Data frames to merge, ordered from earliest to latest.
    key_cols: list(str)
      Columns to use as the index.
+    logger structlog.Logger [optional; default False]
+      Logger to receive messages

    Returns a single data frame containing the most recent data for each state+date.
    """

    dfs = [df.set_index(key_cols) for df in dfs
           if not all(k in df.index.names for k in key_cols)]
    result = dfs[0]
+    if logger and len(dfs) > 7:
+      logger.warning(
+        "expensive operation",
+        msg="concatenating more than 7 files may result in long running times",
+        count=len(dfs))
    for df in dfs[1:]:
      # update values for existing keys
      result.update(df)
@@ -153,22 +173,25 @@ def update_dataset(database, network, newer_than=None, older_than=None):
    bool
      Whether a new dataset was acquired.
    """
-    metadata = network.fetch_metadata()
+    logger = get_structured_logger(f"{database.__class__.__module__}.{database.__class__.__name__}.update_dataset")
+
+    metadata = network.fetch_metadata(logger=logger)
    datasets = []
    with database.connect() as db:
-      max_issue = db.get_max_issue()
+      max_issue = db.get_max_issue(logger=logger)

      older_than = datetime.datetime.today().date() if newer_than is None else older_than
      newer_than = max_issue if newer_than is None else newer_than
-      daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than)
+      daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
      if not daily_issues:
-        print("no new issues, nothing to do")
+        logger.info("no new issues; nothing to do")
        return False
      for issue, revisions in daily_issues.items():
        issue_int = int(issue.strftime("%Y%m%d"))
        # download the dataset and add it to the database
        dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
-                                          db.KEY_COLS)
+                                          db.KEY_COLS,
+                                          logger=logger)
        # add metadata to the database
        all_metadata = []
        for url, index in revisions:
@@ -180,10 +203,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
        ))
    with database.connect() as db:
      for issue_int, dataset, all_metadata in datasets:
-        db.insert_dataset(issue_int, dataset)
+        db.insert_dataset(issue_int, dataset, logger=logger)
        for url, metadata_json in all_metadata:
-          db.insert_metadata(issue_int, url, metadata_json)
-        print(f'successfully acquired {len(dataset)} rows')
+          db.insert_metadata(issue_int, url, metadata_json, logger=logger)
+        logger.info("acquired rows", count=len(dataset))

    # note that the transaction is committed by exiting the `with` block
    return True

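One note on the logger wiring in update_dataset: the logger name is derived from the concrete database class, so events carry the fully qualified class path. A rough sketch of what that works out to for an assumed module layout (the exact dotted path is an assumption, not taken from this commit):

# For a database object whose class lives at, say,
# delphi.epidata.acquisition.covid_hosp.facility.database.Database,
# the f-string below yields roughly:
#   "delphi.epidata.acquisition.covid_hosp.facility.database.Database.update_dataset"
logger = get_structured_logger(
  f"{database.__class__.__module__}.{database.__class__.__name__}.update_dataset")
logger.info("acquired rows", count=123)  # count value shown is made up; update_dataset passes len(dataset)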
src/acquisition/covid_hosp/facility/database.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class Database(BaseDatabase):
      Columndef('ccn', 'ccn', str),
      Columndef('city', 'city', str),
      Columndef('fips_code', 'fips_code', str),
-      Columndef('geocoded_hospital_address', 'geocoded_hospital_address', str),
+      Columndef('geocoded_hospital_address', 'geocoded_hospital_address', Utils.limited_string_fn(32)),
      Columndef('hhs_ids', 'hhs_ids', str),
      Columndef('hospital_name', 'hospital_name', str),
      Columndef('hospital_subtype', 'hospital_subtype', str),

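For reference, a short illustration of what Utils.limited_string_fn(32) does now that it replaces str as the dtype for geocoded_hospital_address; the sample values below are made up:

check_geocode = Utils.limited_string_fn(32)

check_geocode("POINT (-77.0364 38.8951)")
# -> returned unchanged (24 characters, within the 32-character limit)

check_geocode("POINT (-77.036400000000 38.895100000000)")
# -> raises CovidHospException: Value '...':40 longer than max 32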