# export_start_date = PARAMS["indicator"]["export_start_date"]

# Directory the tests expect generated export CSVs to be written into.
EXPORT_DIR = "./receiving"

# Socrata app token handed to pull_nchs_mortality_data in the tests below.
# NOTE(review): left empty — presumably the test-data file path short-circuits
# any live Socrata call, or unauthenticated access suffices here; confirm.
SOCRATA_TOKEN = ""
class TestPullNCHS:
    """Tests for pulling and standardizing NCHS mortality data."""

    def test_standardize_columns(self):
        """Raw NCHS column names are renamed onto the standardized schema."""
        raw = pd.DataFrame(
            {
                "start_week": [1],
                "covid_deaths": [2],
                "pneumonia_and_covid_deaths": [4],
                "pneumonia_influenza_or_covid_19_deaths": [8],
            }
        )
        result = standardize_columns(raw)
        expected = pd.DataFrame(
            {
                "timestamp": [1],
                "covid_19_deaths": [2],
                "pneumonia_and_covid_19_deaths": [4],
                "pneumonia_influenza_or_covid_19_deaths": [8],
            }
        )
        pd.testing.assert_frame_equal(expected, result)

    def test_good_file(self):
        """A well-formed file yields the expected columns and NaN placement."""
        pulled = pull_nchs_mortality_data(SOCRATA_TOKEN, "test_data.csv")

        # Test columns
        expected_cols = [
            "covid_19_deaths",
            "total_deaths",
            "percent_of_expected_deaths",
            "pneumonia_deaths",
            "pneumonia_and_covid_19_deaths",
            "influenza_deaths",
            "pneumonia_influenza_or_covid_19_deaths",
            "timestamp",
            "geo_id",
            "population",
        ]
        assert (pulled.columns.values == expected_cols).all()

        def _null_weeks(frame, key_col, key, metric):
            # Timestamps at which `metric` is missing for the given geography.
            return frame.loc[
                (frame[key_col] == key) & (frame[metric].isnull()), "timestamp"
            ].values

        # Test aggregation for NYC and NY: the merged "ny" row is missing a
        # metric exactly when both the NY and NYC source rows are missing it.
        raw = pd.read_csv("./test_data/test_data.csv", parse_dates=["start_week"])
        raw = standardize_columns(raw)
        for metric in METRICS:
            ny_weeks = set(_null_weeks(raw, "state", "New York", metric))
            nyc_weeks = set(_null_weeks(raw, "state", "New York City", metric))
            merged_weeks = set(_null_weeks(pulled, "geo_id", "ny", metric))
            assert merged_weeks == ny_weeks & nyc_weeks

        # Test missing value: every other state's missing weeks carry through
        # from the raw file to the pulled frame unchanged.
        gmpr = GeoMapper()
        state_ids = pd.DataFrame(list(gmpr.get_geo_values("state_id")))
        state_names = gmpr.replace_geocode(
            state_ids, "state_id", "state_name", from_col=0, date_col=None
        )
        # NOTE(review): zip over two DataFrames iterates their column labels,
        # not their rows — presumably intentional given replace_geocode's
        # return shape here; confirm against GeoMapper.
        for state, geo_id in zip(state_names, state_ids):
            if state in {"New York", "New York City"}:
                continue
            for metric in METRICS:
                expected_weeks = set(_null_weeks(raw, "state", state, metric))
                actual_weeks = set(_null_weeks(pulled, "geo_id", geo_id, metric))
                assert actual_weeks == expected_weeks

    def test_bad_file_with_inconsistent_time_col(self):
        """An inconsistent time column in the input raises ValueError."""
        with pytest.raises(ValueError):
            pull_nchs_mortality_data(
                SOCRATA_TOKEN, "bad_data_with_inconsistent_time_col.csv"
            )

    def test_bad_file_with_missing_cols(self):
        """Missing required columns in the input raise ValueError."""
        with pytest.raises(ValueError):
            pull_nchs_mortality_data(SOCRATA_TOKEN, "bad_data_with_missing_cols.csv")
0 commit comments