Skip to content

Commit 67adb8d

Browse files
authored
Use national data for nchs-mortality signals (#1912)
1 parent 833e818 commit 67adb8d

File tree

5 files changed

+66
-57
lines changed

5 files changed

+66
-57
lines changed

nchs_mortality/.pylintrc

+2
Original file line number | Diff line number | Diff line change

@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
     too-many-locals,
     too-many-arguments,
+    too-many-branches,
+    too-many-statements,
     # Allow pytest functions to be part of a class.
     no-self-use,
     # Allow pytest classes to have one test.

nchs_mortality/delphi_nchs_mortality/constants.py

-1
Original file line number | Diff line number | Diff line change

@@ -25,7 +25,6 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = "state"

 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports

nchs_mortality/delphi_nchs_mortality/pull.py

+7-4
Original file line number | Diff line number | Diff line change

@@ -96,8 +96,6 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     {NEWLINE.join(df.columns)}
     """) from exc

-    # Drop rows for locations outside US
-    df = df[df["state"] != "United States"]
     df = df[keep_columns + ["timestamp", "state"]].set_index("timestamp")

     # NCHS considers NYC as an individual state, however, we want it included
@@ -124,6 +122,11 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
+    # Map state to geo_id, but set dropna=False as we also have national data
+    df = gmpr.add_population_column(df, "state_name",
+                                    geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id",
+                          from_col="state", new_col="geo_id", dropna=False)
+    # Manually set geo_id for national data
+    df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]

nchs_mortality/delphi_nchs_mortality/run.py

+37-34
Original file line number | Diff line number | Diff line change

@@ -13,7 +13,7 @@

 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
-                        SENSORS, INCIDENCE_BASE, GEO_RES)
+                        SENSORS, INCIDENCE_BASE)
 from .pull import pull_nchs_mortality_data


@@ -72,51 +72,54 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            df = add_nancodes(df)
-            # df = df[~df["val"].isnull()]
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res=GEO_RES,
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
-        else:
-            for sensor in SENSORS:
+        for geo in ["state", "nation"]:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric,
-                            sensor = sensor)
+                            metric=metric, geo_level=geo)
                 df = df_pull.copy()
-                if sensor == "num":
-                    df["val"] = df[metric]
+                if geo == "nation":
+                    df = df[df["geo_id"] == "us"]
                 else:
-                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df = df[df["geo_id"] != "us"]
+                df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
                 df = add_nancodes(df)
-                # df = df[~df["val"].isnull()]
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                 dates = create_export_csv(
                     df,
-                    geo_res=GEO_RES,
+                    geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
+                    sensor=SENSOR_NAME_MAP[metric],
                     weekly_dates=True
                 )
-                if len(dates) > 0:
-                    stats.append((max(dates), len(dates)))
+            else:
+                for sensor in SENSORS:
+                    logger.info("Generating signal and exporting to CSV",
+                                metric=metric, sensor=sensor, geo_level=geo)
+                    df = df_pull.copy()
+                    if geo == "nation":
+                        df = df[df["geo_id"] == "us"]
+                    else:
+                        df = df[df["geo_id"] != "us"]
+                    if sensor == "num":
+                        df["val"] = df[metric]
+                    else:
+                        df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df["se"] = np.nan
+                    df["sample_size"] = np.nan
+                    df = add_nancodes(df)
+                    sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                    dates = create_export_csv(
+                        df,
+                        geo_res=geo,
+                        export_dir=daily_export_dir,
+                        start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                        sensor=sensor_name,
+                        weekly_dates=True
+                    )
+                    if len(dates) > 0:
+                        stats.append((max(dates), len(dates)))

     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility

nchs_mortality/tests/test_run.py

+20-18
Original file line number | Diff line number | Diff line change

@@ -19,6 +19,7 @@ def test_output_files_exist(self, run_as_module, date):
         for output_folder in folders:
             csv_files = listdir(output_folder)

+        geos = ["nation", "state"]
        dates = [
             "202030",
             "202031",
@@ -38,15 +39,14 @@ def test_output_files_exist(self, run_as_module, date):
         sensors = ["num", "prop"]

         expected_files = []
-        for d in dates:
-            for metric in metrics:
-                if metric == "deaths_percent_of_expected":
-                    expected_files += ["weekly_" + d + "_state_" \
-                                       + metric + ".csv"]
-                else:
-                    for sensor in sensors:
-                        expected_files += ["weekly_" + d + "_state_" \
-                                           + metric + "_" + sensor + ".csv"]
+        for geo in geos:
+            for d in dates:
+                for metric in metrics:
+                    if metric == "deaths_percent_of_expected":
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                    else:
+                        for sensor in sensors:
+                            expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]
         assert set(expected_files).issubset(set(csv_files))

     # the 14th was a Monday
@@ -58,12 +58,14 @@ def test_output_file_format(self, run_as_module, date):
         if is_mon_or_thurs:
             folders.append("receiving")

-        for output_folder in folders:
-            df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
-            )
-            expected_columns = [
-                "geo_id", "val", "se", "sample_size",
-                "missing_val", "missing_se", "missing_sample_size"
-            ]
-            assert (df.columns.values == expected_columns).all()
+        geos = ["nation", "state"]
+        for geo in geos:
+            for output_folder in folders:
+                df = pd.read_csv(
+                    join(output_folder, f"weekly_202026_{geo}_deaths_covid_incidence_prop.csv")
+                )
+                expected_columns = [
+                    "geo_id", "val", "se", "sample_size",
+                    "missing_val", "missing_se", "missing_sample_size"
+                ]
+                assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)