Commit 229a96c

pass seasonids around to use in requests for location-specific data
1 parent 26eff97 commit 229a96c

2 files changed: +98, -67 lines

src/acquisition/flusurv/flusurv.py

Lines changed: 59 additions & 41 deletions
@@ -122,23 +122,41 @@ def fetch_json(path, payload, call_count=1, requests_impl=requests):
     return resp.json()


-def fetch_flusurv_location(location_code):
-    """Return decoded FluSurv JSON object for the given location."""
-    return fetch_json(
+def fetch_flusurv_location(location, seasonids):
+    """Return FluSurv JSON object for the given location."""
+    location_code = location_to_code[location]
+
+    result = fetch_json(
         "PostPhase03DataTool",
         {
             "appversion": "Public",
             "key": "getdata",
-            "injson": [{
-                "networkid": location_code[0],
-                "cacthmentid": location_code[1],
-                "seasonid": seasonid
-            }],
+            "injson": [
+                {
+                    "networkid": location_code[0],
+                    "catchmentid": location_code[1],
+                    "seasonid": elem,
+                } for elem in seasonids],
         },
     )

-def fetch_flusurv_object():
-    """Return raw FluSurv JSON object for all locations."""
+    # If no data is returned (a given seasonid is not reported,
+    # location codes are invalid, etc), the API returns a JSON like:
+    #     {
+    #         'default_data': {
+    #             'response': 'No Data'
+    #         }
+    #     }
+    #
+    # If data is returned, then data["default_data"] is a list
+    # and data["default_data"]["response"] doesn't exist.
+    assert isinstance(result["default_data"], list) and len(result["default_data"]) > 0, \
+        f"Data was not correctly returned from the API for {location}"
+    return result
+
+
+def fetch_flusurv_metadata():
+    """Return FluSurv JSON metadata object."""
     return fetch_json(
         "PostPhase03DataTool",
         {"appversion": "Public", "key": "", "injson": []}
@@ -155,12 +173,13 @@ def mmwrid_to_epiweek(mmwrid):
     return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew()


-def reformat_to_nested(data):
+def group_by_epiweek(data):
     """
-    Convert the default data object into a dictionary grouped by location and epiweek
+    Convert default data for a single location into an epiweek-grouped dictionary

     Args:
-        A GRASP API response object, as fetched with 'fetch_flusurv_object()'
+        data: The "default_data" element of a GRASP API response object,
+            as fetched with 'fetch_flusurv_location' or `fetch_flusurv_metadata`

     Returns a dictionary of the format
     {
@@ -176,21 +195,22 @@ def reformat_to_nested(data):
         ...
     }
     """
+    data = data["default_data"]
+
     # Sanity check the input. We expect to see some epiweeks
-    if len(data["default_data"]) == 0:
+    if len(data) == 0:
         raise Exception("no data found")

-    id_label_map = make_id_label_map(data)
+    id_label_map = make_id_label_map()

     # Create output object
-    # First layer of keys is locations. Second layer of keys is epiweeks.
-    # Third layer of keys is groups (by id, not age in years, sex abbr, etc).
+    # First layer of keys is epiweeks. Second layer of keys is groups
+    # (by id, not age in years, sex abbr, etc).
     #
     # If a top-level key doesn't already exist, create a new empty dict.
-    # If a secondary key doesn't already exist, create a new empty dict.
-    # If a tertiary key doesn't already exist, create a new key with a
+    # If a secondary key doesn't already exist, create a new key with a
     # default value of None if not provided.
-    data_out = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: None)))
+    data_out = defaultdict(lambda: defaultdict(lambda: None))

     # data["default_data"] is a list of dictionaries, with the format
     # [
@@ -199,66 +219,62 @@ def reformat_to_nested(data):
     #  {'networkid': 1, 'catchmentid': 22, 'seasonid': 49, 'ageid': 0, 'sexid': 0, 'raceid': 1, 'rate': 20.6, 'weeklyrate': 0.1, 'mmwrid': 2516},
     #  ...
     # ]
-    for obs in data["default_data"]:
+    for obs in data:
         epiweek = mmwrid_to_epiweek(obs["mmwrid"])
-        location = code_to_location[(obs["networkid"], obs["catchmentid"])]
         groupname = groupids_to_name(
             ageid = obs["ageid"], sexid = obs["sexid"], raceid = obs["raceid"],
             id_label_map = id_label_map
         )

         rate = obs["weeklyrate"]
-        prev_rate = data_out[location][epiweek][groupname]
+        prev_rate = data_out[epiweek][groupname]
         if prev_rate is None:
-            # This is the first time to see a rate for this location-epiweek-
-            # group combo
-            data_out[location][epiweek][groupname] = rate
+            # This is the first time to see a rate for this epiweek-group
+            # combo
+            data_out[epiweek][groupname] = rate
         elif prev_rate != rate:
             # Skip and warn; a different rate was already found for this
-            # location-epiweek-group combo
-            warn((f"warning: Multiple rates seen for {location} {epiweek} "
+            # epiweek-group combo
+            warn((f"warning: Multiple rates seen for {epiweek} "
                 f"{groupname}, but previous value {prev_rate} does not "
                 f"equal new value {rate}. Using the first value."))

     # Sanity check the input. We expect to have populated our dictionary
     if len(data_out.keys()) == 0:
         raise Exception("no data loaded")

-    print(f"found data for {len(data_out.keys())} locations")
-    # Just check one location to avoid iterating through the entire
-    # dictionary.
-    print(f"found data for {len(data_out[location].keys())} epiweeks for {location}")
+    print(f"found data for {len(data_out.keys())} epiweeks")

     return data_out


-def get_data(location_code):
+def get_data(location, seasonids):
     """
     Fetch and parse flu data for the given location.

     This method performs the following operations:
-      - fetches FluSurv data from CDC
+      - filters location-specific FluSurv data from CDC API response object
       - extracts and returns hospitalization rates
     """
-
     # fetch
     print("[fetching flusurv data...]")
-    data_in = fetch_flusurv_location(location_code)
+    data_in = fetch_flusurv_location(location, seasonids)

     # extract
-    print("[extracting values...]")
-    data_out = reformat_to_nested(data_in)
+    print("[reformatting flusurv result...]")
+    data_out = group_by_epiweek(data_in)

     # return
-    print("[scraped successfully]")
+    print(f"[successfully fetched data for {location}]")
     return data_out


 def get_current_issue(data):
     """
     Extract the current issue from the FluSurv API result.

-    data: dictionary representing a JSON response from the FluSurv API
+    Args:
+        data: dictionary representing a JSON response from the FluSurv API
     """
     # extract
     date = datetime.strptime(data["loaddatetime"], "%b %d, %Y")
@@ -267,8 +283,10 @@ def get_current_issue(data):
     return EpiDate(date.year, date.month, date.day).get_ew()


-def make_id_label_map(data):
+def make_id_label_map():
     """Create a map from valueid to group description"""
+    data = fetch_flusurv_metadata()
+
     id_to_label = defaultdict(lambda: defaultdict(lambda: None))
     for group in data["master_lookup"]:
         # Skip "overall" group

src/acquisition/flusurv/flusurv_update.py

Lines changed: 39 additions & 26 deletions
@@ -70,6 +70,7 @@

 # standard library
 import argparse
+from warnings import warn

 # third party
 import mysql.connector
@@ -81,6 +82,8 @@
 from delphi.utils.epiweek import delta_epiweeks


+max_age_to_consider_weeks = 52
+
 def get_rows(cur):
     """Return the number of rows in the `flusurv` table."""

@@ -90,13 +93,10 @@ def get_rows(cur):
     return num


-def update(issue, location, test_mode=False):
-    """Fetch and store the currently avialble weekly FluSurv dataset."""
-
-    # fetch data
-    location_code = flusurv.location_to_code[location]
-    print("fetching data for", location, location_code)
-    data = flusurv.get_data(location_code)
+def update(issue, location, seasonids, test_mode=False):
+    """Fetch and store the currently available weekly FluSurv dataset."""
+    # Fetch location-specific data
+    data = flusurv.get_data(location, seasonids)

     # metadata
     epiweeks = sorted(data.keys())
@@ -214,10 +214,16 @@ def update(issue, location, test_mode=False):

     # insert/update each row of data (one per epiweek)
     for epiweek in epiweeks:
-        # As of Sept 2023, we expect to see these 24 groups, as described in
-        # the top-level "master_lookup" element of the new GRASP API
-        # (https://gis.cdc.gov/GRASP/Flu3/PostPhase03DataTool) response
-        # object:
+        lag = delta_epiweeks(epiweek, issue)
+        if lag > max_age_to_consider_weeks:
+            # Ignore values older than one year, as (1) they are assumed not to
+            # change, and (2) it would adversely affect database performance if all
+            # values (including duplicates) were stored on each run.
+            continue
+
+        # As of Sept 2023, for new data we expect to see these 23 groups, as
+        # described in the top-level "master_lookup" element of the new GRASP API
+        # (https://gis.cdc.gov/GRASP/Flu3/PostPhase03DataTool) response object:
         # 'master_lookup' = [
         #  {'Variable': 'Age', 'valueid': 1, 'parentid': 97, 'Label': '0-4 yr', 'Color_HexValue': '#d19833', 'Enabled': True},
         #  {'Variable': 'Age', 'valueid': 2, 'parentid': 97, 'Label': '5-17 yr', 'Color_HexValue': '#707070', 'Enabled': True},
@@ -247,9 +253,11 @@ def update(issue, location, test_mode=False):
         #  {'Variable': None, 'valueid': 0, 'parentid': 0, 'Label': 'Overall', 'Color_HexValue': '#000000', 'Enabled': True},
         # ]
         #
+        # All 23 strata are available starting with epiweek 200935.
+        #
         # The previous version of the GRASP API
         # (https://gis.cdc.gov/GRASP/Flu3/GetPhase03InitApp)
-        # used a different age group-id mapping, as described in the
+        # used the following age groupid mapping, as described in the
         # top-level "ages" element:
         # 'ages' = [
         #  {'label': '0-4 yr', 'ageid': 1, 'color_hexvalue': '#1B9E77'},
@@ -263,18 +271,15 @@ def update(issue, location, test_mode=False):
         #  {'label': '85+', 'ageid': 9, 'color_hexvalue': '#1f78b4'}
         # ]
         #
-        # In addition to the new age, race, and sex breakdowns, the
-        # group id for overall reporting has changed from 6 to 0.
-        n_max_expected_groups = 24
-        assert len(epiweek.keys()) == n_max_expected_groups, \
-            f"{location} {epiweek} data does not contain the expected {n_max_expected_groups} groups"
+        # In addition to the new age, race, and sex breakdowns, the group
+        # id for overall reporting has changed from 6 to 0. Ageids 1-5
+        # and 7-9 retain the same meanings.
+        n_expected_groups = 23
+        if len(data[epiweek].keys()) != n_expected_groups:
+            warn(
+                f"{location} {epiweek} data does not contain the expected {n_expected_groups} groups"
+            )

-        lag = delta_epiweeks(epiweek, issue)
-        if lag > 52:
-            # Ignore values older than one year, as (1) they are assumed not to
-            # change, and (2) it would adversely affect database performance if all
-            # values (including duplicates) were stored on each run.
-            continue
         args_meta = {
             "release_date": release_date,
             "issue": issue,
@@ -313,20 +318,28 @@ def main():
     # fmt: on
     args = parser.parse_args()

-    data = fetch_flusurv_object()
+    data = flusurv.fetch_flusurv_metadata()

     # scrape current issue from the main page
     issue = flusurv.get_current_issue(data)
     print(f"current issue: {int(issue)}")

+    # Ignore seasons with all dates older than one year
+    seasonids = {
+        season_blob["seasonid"] for season_blob in data["seasons"]
+        if delta_epiweeks(flusurv.mmwrid_to_epiweek(season_blob["endweek"]), issue) < max_age_to_consider_weeks
+    }
+
     # fetch flusurv data
     if args.location == "all":
         # all locations
         for location in flusurv.location_to_code.keys():
-            update(issue, location, args.test)
+            update(issue, location, seasonids, args.test)
     else:
         # single location
-        update(issue, args.location, args.test)
+        assert args.location in flusurv.location_to_code.keys(), \
+            f"Requested location {args.location} not available"
+        update(issue, args.location, seasonids, args.test)


 if __name__ == "__main__":
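A rough sketch of the season filtering main() now performs before looping over locations. The metadata, mmwrid_to_epiweek, and delta_epiweeks are replaced with fabricated stand-ins so the snippet runs on its own:

max_age_to_consider_weeks = 52

# Fabricated metadata: each season blob carries its seasonid and the mmwrid
# of its final week (the real list comes from fetch_flusurv_metadata()).
seasons = [
    {"seasonid": 58, "endweek": 3150},  # ended well over a year ago
    {"seasonid": 59, "endweek": 3210},  # ended within the last year
    {"seasonid": 60, "endweek": 3252},  # current season
]

# Stand-in for delta_epiweeks(flusurv.mmwrid_to_epiweek(...), issue):
# pretend one mmwrid step is one week and measure age against a fixed
# "current" mmwrid.
current_mmwrid = 3252
def weeks_old(endweek):
    return current_mmwrid - endweek

seasonids = {
    blob["seasonid"] for blob in seasons
    if weeks_old(blob["endweek"]) < max_age_to_consider_weeks
}

print(sorted(seasonids))  # [59, 60]; season 58 is dropped as too old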
