Skip to content

Commit 67adb8d

Browse files
authored
Use national data for nchs-mortality signals (#1912)
1 parent 833e818 commit 67adb8d

File tree

5 files changed

+66
-57
lines changed

5 files changed

+66
-57
lines changed

nchs_mortality/.pylintrc

+2
Original file line number | Diff line number | Diff line change

@@ -4,6 +4,8 @@
 disable=logging-format-interpolation,
     too-many-locals,
     too-many-arguments,
+    too-many-branches,
+    too-many-statements,
     # Allow pytest functions to be part of a class.
     no-self-use,
     # Allow pytest classes to have one test.

nchs_mortality/delphi_nchs_mortality/constants.py

-1
Original file line number | Diff line number | Diff line change

@@ -25,7 +25,6 @@
     "prop"
 ]
 INCIDENCE_BASE = 100000
-GEO_RES = "state"

 # this is necessary as a delimiter in the f-string expressions we use to
 # construct detailed error reports

nchs_mortality/delphi_nchs_mortality/pull.py

+7-4
Original file line number | Diff line number | Diff line change

@@ -96,8 +96,6 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     {NEWLINE.join(df.columns)}
     """) from exc

-    # Drop rows for locations outside US
-    df = df[df["state"] != "United States"]
     df = df[keep_columns + ["timestamp", "state"]].set_index("timestamp")

     # NCHS considers NYC as an individual state, however, we want it included
@@ -124,6 +122,11 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
-    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
-    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
+    # Map state to geo_id, but set dropna=False as we also have national data
+    df = gmpr.add_population_column(df, "state_name",
+                                    geocode_col="state", dropna=False)
+    df = gmpr.add_geocode(df, "state_name", "state_id",
+                          from_col="state", new_col="geo_id", dropna=False)
+    # Manually set geo_id for national data
+    df.loc[df["state"] == "United States", "geo_id"] = "us"
     return df[keep_columns]

nchs_mortality/delphi_nchs_mortality/run.py

+37-34
Original file line number | Diff line number | Diff line change

@@ -13,7 +13,7 @@

 from .archive_diffs import arch_diffs
 from .constants import (METRICS, SENSOR_NAME_MAP,
-                        SENSORS, INCIDENCE_BASE, GEO_RES)
+                        SENSORS, INCIDENCE_BASE)
 from .pull import pull_nchs_mortality_data


@@ -72,51 +72,54 @@ def run_module(params: Dict[str, Any]):
     stats = []
     df_pull = pull_nchs_mortality_data(token, test_file)
     for metric in METRICS:
-        if metric == 'percent_of_expected_deaths':
-            logger.info("Generating signal and exporting to CSV",
-                        metric = metric)
-            df = df_pull.copy()
-            df["val"] = df[metric]
-            df["se"] = np.nan
-            df["sample_size"] = np.nan
-            df = add_nancodes(df)
-            # df = df[~df["val"].isnull()]
-            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
-            dates = create_export_csv(
-                df,
-                geo_res=GEO_RES,
-                export_dir=daily_export_dir,
-                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                sensor=sensor_name,
-                weekly_dates=True
-            )
-            if len(dates) > 0:
-                stats.append((max(dates), len(dates)))
-        else:
-            for sensor in SENSORS:
+        for geo in ["state", "nation"]:
+            if metric == 'percent_of_expected_deaths':
                 logger.info("Generating signal and exporting to CSV",
-                            metric = metric,
-                            sensor = sensor)
+                            metric=metric, geo_level=geo)
                 df = df_pull.copy()
-                if sensor == "num":
-                    df["val"] = df[metric]
+                if geo == "nation":
+                    df = df[df["geo_id"] == "us"]
                 else:
-                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df = df[df["geo_id"] != "us"]
+                df["val"] = df[metric]
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
                 df = add_nancodes(df)
-                # df = df[~df["val"].isnull()]
-                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                 dates = create_export_csv(
                     df,
-                    geo_res=GEO_RES,
+                    geo_res=geo,
                     export_dir=daily_export_dir,
                     start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
-                    sensor=sensor_name,
+                    sensor=SENSOR_NAME_MAP[metric],
                     weekly_dates=True
                 )
-                if len(dates) > 0:
-                    stats.append((max(dates), len(dates)))
+            else:
+                for sensor in SENSORS:
+                    logger.info("Generating signal and exporting to CSV",
+                                metric=metric, sensor=sensor, geo_level=geo)
+                    df = df_pull.copy()
+                    if geo == "nation":
+                        df = df[df["geo_id"] == "us"]
+                    else:
+                        df = df[df["geo_id"] != "us"]
+                    if sensor == "num":
+                        df["val"] = df[metric]
+                    else:
+                        df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
+                    df["se"] = np.nan
+                    df["sample_size"] = np.nan
+                    df = add_nancodes(df)
+                    sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
+                    dates = create_export_csv(
+                        df,
+                        geo_res=geo,
+                        export_dir=daily_export_dir,
+                        start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
+                        sensor=sensor_name,
+                        weekly_dates=True
+                    )
+                    if len(dates) > 0:
+                        stats.append((max(dates), len(dates)))

     # Weekly run of archive utility on Monday
     # - Does not upload to S3, that is handled by daily run of archive utility

nchs_mortality/tests/test_run.py

+20-18
Original file line number | Diff line number | Diff line change

@@ -19,6 +19,7 @@ def test_output_files_exist(self, run_as_module, date):
         for output_folder in folders:
             csv_files = listdir(output_folder)

+        geos = ["nation", "state"]
        dates = [
             "202030",
             "202031",
@@ -38,15 +39,14 @@ def test_output_files_exist(self, run_as_module, date):
         sensors = ["num", "prop"]

         expected_files = []
-        for d in dates:
-            for metric in metrics:
-                if metric == "deaths_percent_of_expected":
-                    expected_files += ["weekly_" + d + "_state_" \
-                                       + metric + ".csv"]
-                else:
-                    for sensor in sensors:
-                        expected_files += ["weekly_" + d + "_state_" \
-                                           + metric + "_" + sensor + ".csv"]
+        for geo in geos:
+            for d in dates:
+                for metric in metrics:
+                    if metric == "deaths_percent_of_expected":
+                        expected_files += [f"weekly_{d}_{geo}_{metric}.csv"]
+                    else:
+                        for sensor in sensors:
+                            expected_files += [f"weekly_{d}_{geo}_{metric}_{sensor}.csv"]
         assert set(expected_files).issubset(set(csv_files))

     # the 14th was a Monday
@@ -58,12 +58,14 @@ def test_output_file_format(self, run_as_module, date):
         if is_mon_or_thurs:
             folders.append("receiving")

-        for output_folder in folders:
-            df = pd.read_csv(
-                join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
-            )
-            expected_columns = [
-                "geo_id", "val", "se", "sample_size",
-                "missing_val", "missing_se", "missing_sample_size"
-            ]
-            assert (df.columns.values == expected_columns).all()
+        geos = ["nation", "state"]
+        for geo in geos:
+            for output_folder in folders:
+                df = pd.read_csv(
+                    join(output_folder, f"weekly_202026_{geo}_deaths_covid_incidence_prop.csv")
+                )
+                expected_columns = [
+                    "geo_id", "val", "se", "sample_size",
+                    "missing_val", "missing_se", "missing_sample_size"
+                ]
+                assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)